## Generate random expressions like the bayesian_optimization notebook and save them to file

In [2]:
using SymbolicUtils
using DynamicExpressions
using Optim: Optim
using StatsBase: sample, Weights


function create_node((node_type, payload), operators::OperatorEnum)
    if node_type == "constant"
        return Node{Float64}(; val=rand(Float64))  # one(Float64)
    elseif node_type == "feature"
        return Node{Float64}(; feature=payload)
    else
        n = Node{Float64}()
        degree = node_type == "unary" ? 1 : 2
        n.degree = degree
        if degree == 1
            @assert(
                payload in eachindex(operators.unaops),
                "Could not find unary operator with index $payload"
            )
        else
            @assert(
                payload in eachindex(operators.binops),
                "Could not find binary operator with index $payload"
            )
        end
        n.op = payload
        return n
    end
end

function create_tree(nodes::AbstractVector{Tuple{String,Int}}, operators::OperatorEnum)::Node
    _create_tree!(copy(nodes), operators)
end

function _create_tree!(nodes::AbstractVector{Tuple{String,Int}}, operators::OperatorEnum)::Node
    if isempty(nodes)
        # Fallback is to simply return 1.0
        return create_node(("constant", 0), operators)
    end
    new_root = create_node(popfirst!(nodes), operators)
    if new_root.degree == 0
        # Nothing to do
    elseif new_root.degree == 1
        new_root.l = _create_tree!(nodes, operators)
    elseif new_root.degree == 2
        new_root.l = _create_tree!(nodes, operators)
        new_root.r = _create_tree!(nodes, operators)
    else
        error()
    end
    return new_root
end

function loss(operators::OperatorEnum, X::Matrix{Float64}, y::Vector{Float64})
    let X_t = X'
        function (tree)
            y_predicted = first(eval_tree_array(tree, X_t, operators))
            return sum(i -> abs2(y[i] - y_predicted[i]), eachindex(y, y_predicted))
        end
    end
end

function evaluate(nodes::AbstractVector{Tuple{String,Int}}, operators::OperatorEnum, X::AbstractMatrix{Float64}, y::AbstractVector{Float64})
    tree = create_tree(nodes, operators)
    l = loss(operators, X, y)
    if !has_constants(tree)
        # Nothing to optimize
        return (l(tree), tree)
    end
    res = Optim.optimize(l, tree, Optim.BFGS())
    return (res.minimum, res.minimizer)
end

function possibilities(n_features, operators::OperatorEnum)
    possible_nodes = [
        ("binary", i) for i in eachindex(operators.binops)
    ]
    append!(possible_nodes, [
        ("unary", j) for j in eachindex(operators.unaops)
    ])
    append!(possible_nodes, [
        ("feature", k) for k in 1:n_features
    ])
    append!(possible_nodes, [("constant", 0)])
    return possible_nodes
end

possibilities (generic function with 1 method)

In [3]:
operators = OperatorEnum(binary_operators=(+, -, *), unary_operators=(sin, exp))
num_features = 1
node_types = possibilities(num_features, operators)
println(node_types)

weights = Weights([1, 1, 1, 0.5, 0.5, 2, 0.4])
println(weights)

weights_first = Weights([1, 1, 1, 0.5, 0.5, 0.01, 0])
println(weights_first)

[("binary", 1), ("binary", 2), ("binary", 3), ("unary", 1), ("unary", 2), ("feature", 1), ("constant", 0)]
[1.0, 1.0, 1.0, 0.5, 0.5, 2.0, 0.4]
[1.0, 1.0, 1.0, 0.5, 0.5, 0.01, 0.0]


In [4]:
# Sample list of nodes
n = 5
cnt = 100_000

trees = []
for _ in 1:cnt
    # Make sure first node is not a constant
    first_node = sample(node_types, weights_first, 1)
    node_list = [first_node; sample(node_types, weights, n-1)]
    tree = create_tree(node_list, operators)
    # println(tree)
    simplify_tree!(tree, operators)
    # println(tree)
    # println("-----------")
    if tree.degree > 0
        push!(trees, tree)
    end
end

trees = unique(trees)

53150-element Vector{Any}:
 sin(sin(x1))
 ((x1 - 0.6914618846717281) - 0.9852721251433112) - 0.08464646046007085
 (x1 * x1) - x1
 x1 * 0.915150496639181
 (x1 * x1) * 1.1766125290979907
 x1 + -0.5449750350590586
 0.3392351121778863 + x1
 exp(exp(x1))
 (((x1 + 0.44671198424866765) * 0.9784414178570849) - 0.9527580796164442) * 0.8421678230366046
 (x1 - 0.7888589470028463) * 0.7146740405120758
 ⋮
 ((exp(x1) * 0.28884774583091877) + 0.6519908121268015) * 0.7153064190183511
 ((x1 * 0.12609399104627717) - 0.647191513128363) * 0.2974244331041176
 (sin(x1) + 0.3446762901728785) * 0.8985911394850951
 sin(x1) - (x1 * 0.4625861326169689)
 x1 - -0.8101443430022055
 (x1 - x1) * 0.47436690799625914
 x1 - 0.6373028475394451
 (0.1729007190028713 + (x1 * 0.6099519872152004)) + 0.46878893894843576
 x1 + ((x1 - 0.0958735796680441) * 0.7173702105678573)

In [53]:
# Write trees to a file, one per line
open("../data/expressions.txt", "w") do file
    for tree in trees
        println(file, string(tree))
    end
end


In [131]:
# Read the expressions from the file
expressions = String[]
open("../data/expressions.txt", "r") do file
    for line in eachline(file)
        push!(expressions, String(line))
    end
end

In [235]:
function tree_to_seq(node::GraphNode, operators::OperatorEnum)
    if node.constant
        @info "constant"
        return [string(node.val)]
    elseif node.degree == 0  # Feature
        @info "feature"
        return ["x" * string(node.feature)]
    elseif node.degree == 1  # Unary op
        @info "unary"
        return vcat([string(operators.unaops[node.op])], tree_to_seq(node.l, operators))
    elseif node.degree == 2  # Binary op
        @info "binary"
        return vcat([string(operators.binops[node.op])], tree_to_seq(node.l, operators), tree_to_seq(node.r, operators))
    end
end
for _ in 1:10000
    seq = tree_to_seq(ex.tree, operators)
end

In [246]:
function test(str::String, operators::OperatorEnum, variable_names::Vector{Symbol}, encoding::Vector{String})
    expr = Meta.parse(str)
    ex = parse_expression(expr, operators=operators, variable_names=variable_names, node_type=GraphNode)

    # Parse tree into normal Polish (prefix) notation of fixed length (incl padding)
    function tree_to_seq(node::GraphNode, operators::OperatorEnum)
        if node.constant
            @info "constant"
            return [string(node.val)]
        elseif node.degree == 0  # Feature
            @info "feature"
            return ["x" * string(node.feature)]
        elseif node.degree == 1  # Unary op
            @info "unary"
            return vcat([string(operators.unaops[node.op])], tree_to_seq(node.l, operators))
        elseif node.degree == 2  # Binary op
            @info "binary"
            return vcat([string(operators.binops[node.op])], tree_to_seq(node.l, operators), tree_to_seq(node.r, operators))
        end
    end
    seq = tree_to_seq(ex.tree, operators)
    println(seq)
end


["0.0"]


In [255]:
test(expressions[1], operators, variable_names, encoding)

["2.519974064e-314"]


In [233]:
max_node_cnt = 5
seq_len = 2 * max_node_cnt + 1  # Worst case: linear binary tree with each node having two children (const + binary) and last binary has two const nodes
operators = OperatorEnum(binary_operators=(+, -, *), unary_operators=(sin, exp))
variable_names = [:x1]

# Encoding for onehot matrix
encoding = [string(op) for op in vcat(collect(operators.binops), collect(operators.unaops), variable_names)]
push!(encoding, "[NUM]")
push!(encoding, "[PAD]")


function str_to_onehot(str::String, operators::OperatorEnum, variable_names::Vector{Symbol}, encoding::Vector{String})
    # String to DynamicExpressions expression (incl tree)
    expr = Meta.parse(str)
    ex = parse_expression(expr, operators=operators, variable_names=variable_names, node_type=GraphNode)

    # Parse tree into normal Polish (prefix) notation of fixed length (incl padding)
    function tree_to_seq(node::GraphNode, operators::OperatorEnum)
        if node.constant
            @info "constant"
            return [string(node.val)]
        elseif node.degree == 0  # Feature
            @info "feature"
            return ["x" * string(node.feature)]
        elseif node.degree == 1  # Unary op
            @info "unary"
            return vcat([string(operators.unaops[node.op])], tree_to_seq(node.l, operators))
        elseif node.degree == 2  # Binary op
            @info "binary"
            return vcat([string(operators.binops[node.op])], tree_to_seq(node.l, operators), tree_to_seq(node.r, operators))
        end
    end
    seq = tree_to_seq(ex.tree, operators)
    println(seq)
    seq = vcat(seq, repeat(["[PAD]"], seq_len - length(seq)))
    println(seq)

    # Parse sequence into onehot matrix with numerical values extracted into extra vector (which is part of matrix)

    # This intermediate step replaces numbers with "[NUM]" and stores the numbers in a separate vector. Redunant but might become beneficial
    seq_num = Vector{Float32}(undef, length(seq))
    for (i, token) in enumerate(seq)
        # Check if token is a number
        if occursin(r"^-?\d+(\.\d+)?$", token)
            seq_num[i] = parse(Float32, token)
            seq[i] = "[NUM]"
        else
            seq_num[i] = 0
        end
    end

    println(seq)

    onehot = zeros(Float32, length(encoding)+1, length(seq))
    for (i, token) in enumerate(seq)
        @assert token in encoding "Token $token not in encoding"
        onehot[findfirst(==(token), encoding), i] = 1
        if token == "[NUM]"
            onehot[end, i] = seq_num[i]
        end
    end
    return onehot
end

println(expressions[1])
onehot = str_to_onehot(expressions[1], operators, variable_names, encoding)
onehot

(x1 - 0.125192826477114) + 0.43174347075339936
["2.519974064e-314"]
["2.519974064e-314", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]"]
["2.519974064e-314", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]"]


AssertionError: AssertionError: Token 2.519974064e-314 not in encoding

In [160]:
# Print onehot matrix incl column names
function print_onehot(onehot)
    println(encoding)
    for i in 1:size(onehot, 2)
        println(onehot[:, i])
    end
end
print_onehot(onehot)

["+", "-", "*", "sin", "exp", "x1", "[NUM]", "[PAD]"]
Float32[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Float32[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Float32[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Float32[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
Float32[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.44960538]
Float32[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.60018736]
Float32[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.8458093]
Float32[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]
Float32[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]
Float32[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]
Float32[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]


In [141]:
expressions[1:2]

2-element Vector{String}:
 "(x1 - 0.125192826477114) + 0.43174347075339936"
 "sin(exp(sin(x1) * 0.4609526466213322))"

In [180]:
# Apply str_to_onehot to each expression and create one large tensor
function create_dataset(expressions, operators, variable_names, encoding)

    # Initialize the tensor
    dataset = zeros(Float32, length(encoding)+1, seq_len, length(expressions))
    
    # Fill the tensor
    for (i, expr) in enumerate(expressions)
        println(expr)
        dataset[:, :, i] = str_to_onehot(expr, operators, variable_names, encoding)
    end
    
    return dataset
end

# Create the dataset
dataset = create_dataset([expressions[1]], operators, variable_names, encoding)

# Print the shape of the resulting tensor
println("Dataset shape: ", size(dataset))

(x1 - 0.125192826477114) + 0.43174347075339936
["2.225919638e-314", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]"]


AssertionError: AssertionError: Token 2.225919638e-314 not in encoding

In [14]:
using Logging
using IJulia

# Create a custom logger that writes to the notebook output
notebook_logger = IJulia.ConsoleLogger(IJulia.stdout, Logging.Warn)

# Set the global logger to use the notebook logger
global_logger(notebook_logger)

@info "test"

In [95]:
onehot = zeros(Float32, length(encoding)+1, length(seq))
for (i, token) in enumerate(seq)
    onehot[findfirst(==(token), encoding), i] = 1
    if token == "[NUM]"
        onehot[end, i] = seq_num[i]
    end
end
onehot

7×7 Matrix{Float32}:
 1.0  0.0  0.0  0.0  1.0  0.0  0.0
 0.0  0.0  1.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0  1.0  0.0  1.0  0.0  1.0  0.0
 0.0  2.0  0.0  3.0  0.0  5.0  0.0

### Use older SR.jl and DE.jl versions. Go directly from tree generation to onehot without intermediate str step

In [53]:
using DynamicExpressions
using StatsBase: sample, Weights

function create_node((node_type, payload), operators::OperatorEnum)
    if node_type == "constant"
        return Node{Float64}(; val=rand(Float64))  # one(Float64)
    elseif node_type == "feature"
        return Node{Float64}(; feature=payload)
    else
        n = Node{Float64}()
        degree = node_type == "unary" ? 1 : 2
        n.degree = degree
        n.constant = false
        if degree == 1
            @assert(
                payload in eachindex(operators.unaops),
                "Could not find unary operator with index $payload"
            )
        else
            @assert(
                payload in eachindex(operators.binops),
                "Could not find binary operator with index $payload"
            )
        end
        n.op = payload
        return n
    end
end

function create_tree(nodes::AbstractVector{Tuple{String,Int}}, operators::OperatorEnum)::Node
    _create_tree!(copy(nodes), operators)
end

function _create_tree!(nodes::AbstractVector{Tuple{String,Int}}, operators::OperatorEnum)::Node
    if isempty(nodes)
        # Fallback is to simply return 1.0
        return create_node(("constant", 0), operators)
    end
    new_root = create_node(popfirst!(nodes), operators)
    if new_root.degree == 0
        # Nothing to do
    elseif new_root.degree == 1
        new_root.l = _create_tree!(nodes, operators)
    elseif new_root.degree == 2
        new_root.l = _create_tree!(nodes, operators)
        new_root.r = _create_tree!(nodes, operators)
    else
        error()
    end
    return new_root
end

function possibilities(n_features, operators::OperatorEnum)
    possible_nodes = [
        ("binary", i) for i in eachindex(operators.binops)
    ]
    append!(possible_nodes, [
        ("unary", j) for j in eachindex(operators.unaops)
    ])
    append!(possible_nodes, [
        ("feature", k) for k in 1:n_features
    ])
    append!(possible_nodes, [("constant", 0)])
    return possible_nodes
end

possibilities (generic function with 1 method)

In [54]:
operators = OperatorEnum(binary_operators=(+, -, *), unary_operators=(sin, exp))
num_features = 1
node_types = possibilities(num_features, operators)
println(node_types)

weights = Weights([1, 1, 1, 0.5, 0.5, 2, 0.4])
println(weights)

weights_first = Weights([1, 1, 1, 0.5, 0.5, 0.01, 0])
println(weights_first)

[("binary", 1), ("binary", 2), ("binary", 3), ("unary", 1), ("unary", 2), ("feature", 1), ("constant", 0)]
[1.0, 1.0, 1.0, 0.5, 0.5, 2.0, 0.4]
[1.0, 1.0, 1.0, 0.5, 0.5, 0.01, 0.0]


In [55]:
# Sample list of nodes
max_node_cnt = 5
cnt = 100_000

trees = []
for _ in 1:cnt
    # Make sure first node is not a constant
    first_node = sample(node_types, weights_first, 1)
    node_list = [first_node; sample(node_types, weights, max_node_cnt-1)]
    tree = create_tree(node_list, operators)
    # println(tree)
    simplify_tree!(tree, operators)
    # println(tree)
    # println("-----------")
    if tree.degree > 0
        push!(trees, tree)
    end
end

trees = unique(trees)

52568-element Vector{Any}:
 (x1 * 1.5883793909563968) - 0.05018838430294015
 x1 - exp(x1)
 x1 + 1.3273607784897687
 x1 - x1
 x1 * sin(x1 - 0.8478050523738035)
 x1 * sin(x1 * 0.9926585513644764)
 sin(x1)
 x1 * 0.007129792565842735
 x1 - (x1 + x1)
 (x1 + 0.027105777019746712) + 0.8523044948527446
 ⋮
 x1 * (x1 - 0.31330857936116474)
 ((x1 - x1) + 0.5222009241403515) + 0.7279472766667405
 exp(x1 - (x1 + 0.7311724776009595))
 ((x1 - x1) - 0.7308134763855079) * 0.1188677075324851
 ((x1 - 1.8668722946694833) + 0.35752874650552646) + 0.3368278888588794
 ((x1 + 1.0224859217038054) - 0.5168092179685329) + 0.2877963905705141
 x1 + (x1 - 0.47315278983161935)
 (x1 * 0.8200813489194786) * 0.8291529567896694
 (sin(x1) * x1) - 0.17428806088570314

In [56]:
seq_len = 2 * max_node_cnt + 1  # Worst case: linear binary tree with each node having two children (const + binary) and last binary has two const nodes
operators = OperatorEnum(binary_operators=(+, -, *), unary_operators=(sin, exp))
variable_names = [:x1]

# Encoding for onehot matrix
encoding = [string(op) for op in vcat(collect(operators.binops), collect(operators.unaops), variable_names)]
push!(encoding, "[NUM]")
push!(encoding, "[PAD]")


function tree_to_onehot(tree::Node, operators::OperatorEnum, variable_names::Vector{Symbol}, encoding::Vector{String})
    @assert !tree.constant "Tree is constant. Tree: $tree"

    # Parse tree into normal Polish (prefix) notation of fixed length (incl padding)
    function tree_to_seq(node::Node, operators::OperatorEnum)
        if node.constant
            @info "constant"
            return [string("~", node.val)]
        elseif node.degree == 0  # Feature
            @info "feature"
            return ["x" * string(node.feature)]
        elseif node.degree == 1  # Unary op
            @info "unary"
            return vcat([string(operators.unaops[node.op])], tree_to_seq(node.l, operators))
        elseif node.degree == 2  # Binary op
            @info "binary"
            return vcat([string(operators.binops[node.op])], tree_to_seq(node.l, operators), tree_to_seq(node.r, operators))
        end
    end
    seq = tree_to_seq(tree, operators)
    # println(seq)
    seq = vcat(seq, repeat(["[PAD]"], seq_len - length(seq)))
    # println(seq)

    # Parse sequence into onehot matrix with numerical values extracted into extra vector (which is part of matrix)

    # This intermediate step replaces numbers with "[NUM]" and stores the numbers in a separate vector. Redunant but might become beneficial
    seq_num = Vector{Float32}(undef, length(seq))
    for (i, token) in enumerate(seq)
        # Check if token is a number
        if startswith(token, "~")
            seq_num[i] = parse(Float32, token[2:end])
            seq[i] = "[NUM]"
        else
            seq_num[i] = 0
        end
    end

    # println(seq)

    onehot = zeros(Float32, length(encoding)+1, length(seq))
    for (i, token) in enumerate(seq)
        @assert token in encoding "Token $token not in encoding. Sequence: $seq. Tree: $tree. Const?: $(tree.constant)"
        onehot[findfirst(==(token), encoding), i] = 1
        if token == "[NUM]"
            onehot[end, i] = seq_num[i]
        end
    end
    return onehot
end

println(trees[1])
onehot = tree_to_onehot(trees[1], operators, variable_names, encoding)
onehot

(x1 * 1.5883793909563968) - 0.05018838430294015


9×11 Matrix{Float32}:
 0.0  0.0  0.0  0.0      0.0        0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0      0.0        0.0  0.0  0.0  0.0  0.0  0.0
 0.0  1.0  0.0  0.0      0.0        0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0      0.0        0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0      0.0        0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  1.0  0.0      0.0        0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  1.0      1.0        0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0      0.0        1.0  1.0  1.0  1.0  1.0  1.0
 0.0  0.0  0.0  1.58838  0.0501884  0.0  0.0  0.0  0.0  0.0  0.0

In [57]:

function create_dataset(trees, operators, variable_names, encoding)
    dataset = zeros(Float32, length(encoding)+1, seq_len, length(trees))
    for (i, tree) in enumerate(trees)
        # println(tree)
        # println(i)
        dataset[:, :, i] = tree_to_onehot(tree, operators, variable_names, encoding)
    end
    return dataset
end

# Create the dataset
dataset = create_dataset(trees, operators, variable_names, encoding)

# Print the shape of the resulting tensor
println("Dataset shape: ", size(dataset))

AssertionError: AssertionError: Token 2.1837942978386593e-6 not in encoding. Sequence: ["exp", "-", "[NUM]", "-", "x1", "2.1837942978386593e-6", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]"]. Tree: exp(0.6079614079139909 - (x1 - 2.1837942978386593e-6)). Const?: false

In [52]:
trees[3075].constant

true