Skip to content

Commit

Permalink
Merge 44c6ae4 into da3b197
Browse files Browse the repository at this point in the history
  • Loading branch information
MilesCranmer committed Mar 9, 2022
2 parents da3b197 + 44c6ae4 commit 25d3bd8
Show file tree
Hide file tree
Showing 20 changed files with 196 additions and 95 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "SymbolicRegression"
uuid = "8254be44-1295-4e6a-a16d-46603ac705cb"
authors = ["MilesCranmer <miles.cranmer@gmail.com>"]
version = "0.7.14"
version = "0.8.0"

[deps]
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,10 @@ println("Complexity\tMSE\tEquation")

for member in dominating
size = countNodes(member.tree)
score = member.score
loss = member.loss
string = stringTree(member.tree, options)

println("$(size)\t$(score)\t$(string)")
println("$(size)\t$(loss)\t$(string)")
end
```

Expand Down
4 changes: 2 additions & 2 deletions docs/src/types.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ Node(op::Int, l::Union{AbstractFloat, Int}, r::Union{AbstractFloat, Int})
## Population

Groups of equations are given as a population, which is
an array of trees tagged with score and birthdate---these
an array of trees tagged with score, loss, and birthdate---these
values are given in the `PopMember`.

```@docs
Expand All @@ -39,7 +39,7 @@ Population(X::AbstractMatrix{T}, y::AbstractVector{T}, baseline::T;

## Population members
```@docs
PopMember(t::Node, score::T) where {T<:Real}
PopMember(t::Node, score::T, loss::T) where {T<:Real}
PopMember(dataset::Dataset{T}, baseline::T, t::Node, options::Options) where {T<:Real}
```

Expand Down
5 changes: 3 additions & 2 deletions src/ConstantOptimization.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ import Optim
# Objective function for constant optimization: write the candidate
# constants `x` into `tree`, then return the (un-penalized) loss of the
# resulting equation on `dataset`.
#
# `scoreFunc` returns a `(score, loss)` tuple; we take index 2 (the loss)
# so the optimizer minimizes the raw loss rather than the
# parsimony-penalized score.
function optFunc(x::Vector{CONST_TYPE}, dataset::Dataset{T}, baseline::T,
                 tree::Node, options::Options; allow_diff=false)::T where {T<:Real}
    setConstants(tree, x)
    loss = scoreFunc(dataset, baseline, tree, options; allow_diff=allow_diff)[2]
    return loss
end

# Use Nelder-Mead to optimize the constants in an equation
Expand Down Expand Up @@ -60,7 +61,7 @@ function optimizeConstants(dataset::Dataset{T},

if Optim.converged(result)
setConstants(member.tree, result.minimizer)
member.score = convert(T, result.minimum)
member.score, member.loss = scoreFunc(dataset, baseline, member.tree, options)
member.birth = getTime()
else
setConstants(member.tree, x0)
Expand Down
10 changes: 5 additions & 5 deletions src/HallOfFame.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ has been instantiated or not.
"""
# Create an empty hall of fame: one placeholder slot per possible
# equation size (`options.maxsize + MAX_DEGREE`). Each slot holds a
# constant-node member with sentinel score and loss of `1f9`, and the
# parallel Bool vector marks every slot as not yet instantiated.
function HallOfFame(options::Options)
    actualMaxsize = options.maxsize + MAX_DEGREE
    members = [PopMember(Node(convert(CONST_TYPE, 1)), 1f9, 1f9) for i=1:actualMaxsize]
    exists = [false for i=1:actualMaxsize]
    return HallOfFame(members, exists)
end


Expand All @@ -51,7 +51,7 @@ function calculateParetoFrontier(dataset::Dataset{T},
continue
end
simpler_member = hallOfFame.members[i]
if (member.score - size*options.parsimony) >= (simpler_member.score - i*options.parsimony)
if member.loss >= simpler_member.loss
betterThanAllSmaller = false
break
end
Expand Down Expand Up @@ -95,12 +95,12 @@ function string_dominating_pareto_curve(hallOfFame, baselineMSE,
dominating = calculateParetoFrontier(dataset, hallOfFame, options)
for member in dominating
complexity = countNodes(member.tree)
if member.score < 0.0
throw(DomainError(member.score, "Your loss function must be non-negative."))
if member.loss < 0.0
throw(DomainError(member.loss, "Your loss function must be non-negative."))
end
# Use higher precision when finding the original loss:
relu(x) = x < 0 ? 0 : x
curMSE = relu(Float64(member.score) - Float64(complexity * options.parsimony)) * Float64(baselineMSE)
curMSE = member.loss

delta_c = complexity - lastComplexity
ZERO_POINT = 1e-10
Expand Down
27 changes: 19 additions & 8 deletions src/LossFunctions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,31 +40,42 @@ function EvalLoss(tree::Node, dataset::Dataset{T}, options::Options;
end
end

# Convert a raw loss into a score that includes a complexity penalty.
# The loss is normalized by `baseline`, then a parsimony term of
# `countNodes(tree) * options.parsimony` is added so that larger
# equations receive a worse (higher) score.
function lossToScore(loss::T, baseline::T, tree::Node, options::Options)::T where {T<:Real}
    normalized_loss_term = loss / baseline
    # Renamed from `size`, which shadowed `Base.size`.
    num_nodes = countNodes(tree)
    parsimony_term = num_nodes * options.parsimony
    return normalized_loss_term + parsimony_term
end

# Score an equation. Returns `(score, loss)`: `loss` is the raw value
# from `EvalLoss` over the full dataset, and `score` adds the
# parsimony/complexity penalty via `lossToScore`.
function scoreFunc(dataset::Dataset{T},
                   baseline::T, tree::Node,
                   options::Options; allow_diff=false)::Tuple{T,T} where {T<:Real}
    loss = EvalLoss(tree, dataset, options; allow_diff=allow_diff)
    score = lossToScore(loss, baseline, tree, options)
    return score, loss
end

# Score an equation on a random mini-batch of the dataset. Returns
# `(score, loss)` like `scoreFunc`, but with the loss estimated from
# `options.batchSize` randomly chosen rows. If tree evaluation does not
# complete, a large sentinel value is returned for both elements.
function scoreFuncBatch(dataset::Dataset{T}, baseline::T,
                        tree::Node, options::Options)::Tuple{T,T} where {T<:Real}
    batchSize = options.batchSize
    batch_idx = randperm(dataset.n)[1:batchSize]
    batch_X = dataset.X[:, batch_idx]
    batch_y = dataset.y[batch_idx]
    (prediction, completion) = evalTreeArray(tree, batch_X, options)
    if !completion
        # Evaluation failed partway (e.g. invalid operation); reject with sentinels.
        return T(1000000000), T(1000000000)
    end

    if !dataset.weighted
        loss = Loss(prediction, batch_y, options)
    else
        batch_w = dataset.weights[batch_idx]
        loss = Loss(prediction, batch_y, batch_w, options)
    end
    score = lossToScore(loss, baseline, tree, options)
    return score, loss
end
45 changes: 25 additions & 20 deletions src/Mutate.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ using FromFile
@from "LossFunctions.jl" import scoreFunc, scoreFuncBatch
@from "CheckConstraints.jl" import check_constraints
@from "PopMember.jl" import PopMember
@from "MutationFunctions.jl" import genRandomTree, mutateConstant, mutateOperator, appendRandomOp, prependRandomOp, insertRandomOp, deleteRandomOp, crossoverTrees
@from "MutationFunctions.jl" import genRandomTreeFixedSize, mutateConstant, mutateOperator, appendRandomOp, prependRandomOp, insertRandomOp, deleteRandomOp, crossoverTrees
@from "SimplifyEquation.jl" import simplifyTree, combineOperators, simplifyWithSymbolicUtils
@from "Recorder.jl" import @recorder

Expand All @@ -22,9 +22,10 @@ function nextGeneration(dataset::Dataset{T},

#TODO - reconsider this
if options.batching
beforeLoss = scoreFuncBatch(dataset, baseline, prev, options)
beforeScore, beforeLoss = scoreFuncBatch(dataset, baseline, prev, options)
else
beforeLoss = member.score
beforeScore = member.score
beforeLoss = member.loss
end

nfeatures = dataset.nfeatures
Expand Down Expand Up @@ -102,14 +103,18 @@ function nextGeneration(dataset::Dataset{T},
@recorder tmp_recorder["type"] = "partial_simplify"
end
mutation_accepted = true
return PopMember(tree, beforeLoss, parent=parent_ref), mutation_accepted
return PopMember(tree, beforeScore, beforeLoss, parent=parent_ref), mutation_accepted

is_success_always_possible = true
# Simplification shouldn't hurt complexity; unless some non-symmetric constraint
# to commutative operator...

elseif mutationChoice < cweights[7]
tree = genRandomTree(5, options, nfeatures) # Sometimes we generate a new tree completely tree
# Sometimes we generate a completely new tree.
# We select a random size, though the generated tree
# may have fewer nodes than we request.
tree_size_to_generate = rand(1:curmaxsize)
tree = genRandomTreeFixedSize(tree_size_to_generate, options, nfeatures)
@recorder tmp_recorder["type"] = "regenerate"

is_success_always_possible = true
Expand All @@ -120,7 +125,7 @@ function nextGeneration(dataset::Dataset{T},
tmp_recorder["reason"] = "identity"
end
mutation_accepted = true
return PopMember(tree, beforeLoss, parent=parent_ref), mutation_accepted
return PopMember(tree, beforeScore, beforeLoss, parent=parent_ref), mutation_accepted
end

successful_mutation = successful_mutation && check_constraints(tree, options, curmaxsize)
Expand All @@ -135,27 +140,27 @@ function nextGeneration(dataset::Dataset{T},
tmp_recorder["reason"] = "failed_constraint_check"
end
mutation_accepted = false
return PopMember(copyNode(prev), beforeLoss, parent=parent_ref), mutation_accepted
return PopMember(copyNode(prev), beforeScore, beforeLoss, parent=parent_ref), mutation_accepted
end

if options.batching
afterLoss = scoreFuncBatch(dataset, baseline, tree, options)
afterScore, afterLoss = scoreFuncBatch(dataset, baseline, tree, options)
else
afterLoss = scoreFunc(dataset, baseline, tree, options)
afterScore, afterLoss = scoreFunc(dataset, baseline, tree, options)
end

if isnan(afterLoss)
if isnan(afterScore)
@recorder begin
tmp_recorder["result"] = "reject"
tmp_recorder["reason"] = "nan_loss"
end
mutation_accepted = false
return PopMember(copyNode(prev), beforeLoss, parent=parent_ref), mutation_accepted
return PopMember(copyNode(prev), beforeScore, beforeLoss, parent=parent_ref), mutation_accepted
end

probChange = 1.0
if options.annealing
delta = afterLoss - beforeLoss
delta = afterScore - beforeScore
probChange *= exp(-delta/(temperature*options.alpha))
end
if options.useFrequency
Expand All @@ -170,14 +175,14 @@ function nextGeneration(dataset::Dataset{T},
tmp_recorder["reason"] = "annealing_or_frequency"
end
mutation_accepted = false
return PopMember(copyNode(prev), beforeLoss, parent=parent_ref), mutation_accepted
return PopMember(copyNode(prev), beforeScore, beforeLoss, parent=parent_ref), mutation_accepted
else
@recorder begin
tmp_recorder["result"] = "accept"
tmp_recorder["reason"] = "pass"
end
mutation_accepted = true
return PopMember(tree, afterLoss, parent=parent_ref), mutation_accepted
return PopMember(tree, afterScore, afterLoss, parent=parent_ref), mutation_accepted
end
end

Expand Down Expand Up @@ -206,15 +211,15 @@ function crossoverGeneration(member1::PopMember, member2::PopMember, dataset::Da
num_tries += 1
end
if options.batching
afterLoss1 = scoreFuncBatch(dataset, baseline, child_tree1, options)
afterLoss2 = scoreFuncBatch(dataset, baseline, child_tree2, options)
afterScore1, afterLoss1 = scoreFuncBatch(dataset, baseline, child_tree1, options)
afterScore2, afterLoss2 = scoreFuncBatch(dataset, baseline, child_tree2, options)
else
afterLoss1 = scoreFunc(dataset, baseline, child_tree1, options)
afterLoss2 = scoreFunc(dataset, baseline, child_tree2, options)
afterScore1, afterLoss1 = scoreFunc(dataset, baseline, child_tree1, options)
afterScore2, afterLoss2 = scoreFunc(dataset, baseline, child_tree2, options)
end

baby1 = PopMember(child_tree1, afterLoss1, parent=member1.ref)
baby2 = PopMember(child_tree2, afterLoss2, parent=member2.ref)
baby1 = PopMember(child_tree1, afterScore1, afterLoss1, parent=member1.ref)
baby2 = PopMember(child_tree2, afterScore2, afterLoss2, parent=member2.ref)

crossover_accepted = true
return baby1, baby2, crossover_accepted
Expand Down
22 changes: 16 additions & 6 deletions src/MutationFunctions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -78,15 +78,17 @@ function mutateConstant(
end

# Add a random unary/binary operation to the end of a tree
function appendRandomOp(tree::Node, options::Options, nfeatures::Int)::Node
function appendRandomOp(tree::Node, options::Options, nfeatures::Int; makeNewBinOp::Union{Bool,Nothing}=nothing)::Node
node = randomNode(tree)
while node.degree != 0
node = randomNode(tree)
end


choice = rand()
makeNewBinOp = choice < options.nbin/(options.nuna + options.nbin)
if makeNewBinOp === nothing
choice = rand()
makeNewBinOp = choice < options.nbin/(options.nuna + options.nbin)
end

if makeNewBinOp
newnode = Node(
Expand Down Expand Up @@ -265,6 +267,7 @@ end

# Create a random equation by appending random operators
function genRandomTree(length::Int, options::Options, nfeatures::Int)::Node
# Note that this base tree is just a placeholder; it will be replaced.
tree = Node(convert(CONST_TYPE, 1))
for i=1:length
# TODO: This can be larger number of nodes than length.
Expand All @@ -274,9 +277,16 @@ function genRandomTree(length::Int, options::Options, nfeatures::Int)::Node
end

# Generate a random tree with up to `node_count` nodes, starting from a
# random leaf and repeatedly appending random operators. The result may
# have fewer nodes than requested: when exactly one node short, only a
# unary operator fits, so if no unary operators exist we stop early.
function genRandomTreeFixedSize(node_count::Int, options::Options, nfeatures::Int)::Node
    tree = makeRandomLeaf(nfeatures)
    cur_size = countNodes(tree)
    while cur_size < node_count
        if cur_size == node_count - 1  # Only a unary operator will fit exactly.
            # A binary op would overshoot the requested size, so we must break.
            options.nuna == 0 && break
            tree = appendRandomOp(tree, options, nfeatures; makeNewBinOp=false)
        else
            tree = appendRandomOp(tree, options, nfeatures)
        end
        cur_size = countNodes(tree)
    end
    return tree
end
Expand Down
5 changes: 4 additions & 1 deletion src/Options.jl
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,8 @@ Construct options for `EquationSearch` and other functions.
relative proportion of equations at each complexity; this will
ensure that there are a balanced number of equations considered
for every complexity.
- `useFrequencyInTournament=false`: Whether to use the adaptive parsimony described
above inside the score, rather than just at the mutation accept/reject stage.
- `fast_cycle=false`: Whether to thread over subsamples of equations during
regularized evolution. Slightly improves performance, but is a different
algorithm.
Expand Down Expand Up @@ -242,6 +244,7 @@ function Options(;
crossoverProbability=0.0f0,
warmupMaxsizeBy=0f0,
useFrequency=false,
useFrequencyInTournament=false,
npop=1000,
ncyclesperiteration=300,
fractionReplaced=0.1f0,
Expand Down Expand Up @@ -360,7 +363,7 @@ function Options(;
earlyStopCondition = (loss, complexity) -> loss < earlyStopCondition
end

options = Options{typeof(binary_operators),typeof(unary_operators), typeof(loss)}(binary_operators, unary_operators, bin_constraints, una_constraints, ns, parsimony, alpha, maxsize, maxdepth, fast_cycle, migration, hofMigration, fractionReplacedHof, shouldOptimizeConstants, hofFile, npopulations, perturbationFactor, annealing, batching, batchSize, mutationWeights, crossoverProbability, warmupMaxsizeBy, useFrequency, npop, ncyclesperiteration, fractionReplaced, topn, verbosity, probNegate, nuna, nbin, seed, loss, progress, terminal_width, optimizer_algorithm, optimize_probability, optimizer_nrestarts, optimizer_iterations, recorder, recorder_file, probPickFirst, earlyStopCondition, stateReturn, use_symbolic_utils, timeout_in_seconds, skip_mutation_failures)
options = Options{typeof(binary_operators),typeof(unary_operators), typeof(loss)}(binary_operators, unary_operators, bin_constraints, una_constraints, ns, parsimony, alpha, maxsize, maxdepth, fast_cycle, migration, hofMigration, fractionReplacedHof, shouldOptimizeConstants, hofFile, npopulations, perturbationFactor, annealing, batching, batchSize, mutationWeights, crossoverProbability, warmupMaxsizeBy, useFrequency, useFrequencyInTournament, npop, ncyclesperiteration, fractionReplaced, topn, verbosity, probNegate, nuna, nbin, seed, loss, progress, terminal_width, optimizer_algorithm, optimize_probability, optimizer_nrestarts, optimizer_iterations, recorder, recorder_file, probPickFirst, earlyStopCondition, stateReturn, use_symbolic_utils, timeout_in_seconds, skip_mutation_failures)

@eval begin
Base.print(io::IO, tree::Node) = print(io, stringTree(tree, $options))
Expand Down
1 change: 1 addition & 0 deletions src/OptionsStruct.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ struct Options{A,B,C<:Union{SupervisedLoss,Function}}
crossoverProbability::Float32
warmupMaxsizeBy::Float32
useFrequency::Bool
useFrequencyInTournament::Bool
npop::Int
ncyclesperiteration::Int
fractionReplaced::Float32
Expand Down
Loading

0 comments on commit 25d3bd8

Please sign in to comment.