Merge pull request #276 from MilesCranmer/custom-objectives
Allow user to specify full objective functions
MilesCranmer committed Mar 25, 2023
2 parents 5abd46e + 2fa6a85 commit 135e2ff
Showing 4 changed files with 168 additions and 9 deletions.
119 changes: 118 additions & 1 deletion docs/examples.md
@@ -318,7 +318,124 @@ model.predict(X, -1)

to make predictions with the most accurate expression.

## 9. Additional features
## 9. Custom objectives

You can also pass a custom objective as a snippet of Julia code,
which might include symbolic manipulations or custom functional forms.
These do not even need to be differentiable! First, let's look at the
default objective (shown here in simplified form, without weights
and using mean squared error), so that you can see how to write your own:

```julia
function default_objective(tree, dataset::Dataset{T,L}, options)::L where {T,L}
    (prediction, completion) = eval_tree_array(tree, dataset.X, options)
    if !completion
        return L(Inf)
    end

    diffs = prediction .- dataset.y

    return sum(diffs .^ 2) / length(diffs)
end
```

Here, the `where {T,L}` syntax defines the function for arbitrary types `T` and `L`.
If you have `precision=32` (default) and pass in regular floating point data,
then both `T` and `L` will be equal to `Float32`. If you pass in complex data,
then `T` will be `ComplexF32` and `L` will be `Float32` (since we need to return
a real number from the loss function). But you don't need to worry about this;
just make sure to return a scalar number of type `L`.
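
For instance (a minimal sketch, just to illustrate the type mapping described above), the floating-point width is controlled by the `precision` parameter of `PySRRegressor`:

```python
from pysr import PySRRegressor

# precision=32 (the default) with real-valued data gives T = L = Float32
# inside the Julia objective; precision=64 gives T = L = Float64.
# For complex-valued data, T becomes the complex counterpart while L stays real.
model = PySRRegressor(precision=64)
```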

The `tree` argument is the current expression being evaluated. You can read
about the `tree` fields [here](https://astroautomata.com/SymbolicRegression.jl/stable/types/).

For example, let's fix the symbolic form of the expression to be a rational
function, i.e., $P(X)/Q(X)$ for polynomials $P$ and $Q$:

```python
objective = """
function my_custom_objective(tree, dataset::Dataset{T,L}, options) where {T,L}
    # Require root node to be binary, so we can split it,
    # otherwise return a large loss:
    tree.degree != 2 && return L(Inf)

    P = tree.l
    Q = tree.r

    # Evaluate numerator:
    P_prediction, flag = eval_tree_array(P, dataset.X, options)
    !flag && return L(Inf)

    # Evaluate denominator:
    Q_prediction, flag = eval_tree_array(Q, dataset.X, options)
    !flag && return L(Inf)

    # Impose functional form:
    prediction = P_prediction ./ Q_prediction

    diffs = prediction .- dataset.y

    return sum(diffs .^ 2) / length(diffs)
end
"""

model = PySRRegressor(
    niterations=100,
    binary_operators=["*", "+", "-"],
    full_objective=objective,
)
```

Note how we did not pass `/` as a binary operator; it will just be implicit
in the functional form.

> **Warning**: When using a custom objective like this that performs symbolic
> manipulations, many functionalities of PySR will not work, such as `.sympy()`,
> `.predict()`, etc. This is because the SymPy parsing does not know
> how you are manipulating the expression, so you will need to do this yourself.

Let's generate an equation of the form $\frac{x_0^2 x_1 - 2}{x_2^2 + 1}$:

```python
X = np.random.randn(1000, 3)
y = (X[:, 0]**2 * X[:, 1] - 2) / (X[:, 2]**2 + 1)
```

Finally, let's fit:

```python
model.fit(X, y)
```

> Note that the printed equation is not the same as the evaluated equation,
> because the printing functionality does not know about the functional form.

We can get the string format with:

```python
model.get_best().equation
```

(or, you could use `model.equations_.iloc[-1].equation`)

For me, this equation was:

```text
(((2.3554819 + -0.3554746) - (x1 * (x0 * x0))) - (-1.0000019 - (x2 * x2)))
```

Looking at the bracket structure of the equation, we can see that the outermost
bracket is split at the `-` operator (recall that the root operator is ignored
during evaluation, since we simply evaluate each of its arguments and divide
the results) into
`((2.3554819 + -0.3554746) - (x1 * (x0 * x0)))` and
`(-1.0000019 - (x2 * x2))`. This means that our discovered equation is
equal to
$\frac{x_0^2 x_1 - 2.0000073}{x_2^2 + 1.0000019}$, which
is nearly identical to the true equation!
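
Because `.predict()` does not know about the implicit division (see the warning
above), one way to sanity-check the result is to evaluate the two children of
the root node by hand. Here is a minimal sketch, re-using `X` and `y` from
above and the sub-expressions printed by the model (your constants will differ
from run to run):

```python
import numpy as np

x0, x1, x2 = X[:, 0], X[:, 1], X[:, 2]

# Left and right children of the root node, copied from the printed equation:
P = (2.3554819 + -0.3554746) - (x1 * (x0 * x0))
Q = -1.0000019 - (x2 * x2)

# Re-impose the P/Q functional form by hand:
manual_prediction = P / Q

# Should be close to zero, since P/Q ≈ (x0**2 * x1 - 2) / (x2**2 + 1):
print(np.max(np.abs(manual_prediction - y)))
```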



## 10. Additional features

For the many other features available in PySR, please
read the [Options section](options.md).
1 change: 1 addition & 0 deletions docs/param_groupings.yml
@@ -11,6 +11,7 @@
- ncyclesperiteration
- The Objective:
- loss
- full_objective
- model_selection
- Working with Complexities:
- parsimony
36 changes: 32 additions & 4 deletions pysr/sr.py
@@ -320,9 +320,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
argument is constrained.
Default is `None`.
loss : str
String of Julia code specifying the loss function. Can either
be a loss from LossFunctions.jl, or your own loss written as a
function. Examples of custom written losses include:
String of Julia code specifying an elementwise loss function.
Can either be a loss from LossFunctions.jl, or your own loss
written as a function. Examples of custom written losses include:
`myloss(x, y) = abs(x-y)` for non-weighted, or
`myloss(x, y, w) = w*abs(x-y)` for weighted.
The included losses include:
@@ -335,6 +335,26 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
`ModifiedHuberLoss()`, `L2MarginLoss()`, `ExpLoss()`,
`SigmoidLoss()`, `DWDMarginLoss(q)`.
Default is `"L2DistLoss()"`.
full_objective : str
Alternatively, you can specify the full objective function as
a snippet of Julia code, including any sort of custom evaluation
(including symbolic manipulations beforehand), and any sort
of loss function or regularizations. The default `full_objective`
used in SymbolicRegression.jl is roughly equal to:
```julia
function eval_loss(tree, dataset::Dataset{T}, options)::T where T
    prediction, flag = eval_tree_array(tree, dataset.X, options)
    if !flag
        return T(Inf)
    end
    sum((prediction .- dataset.y) .^ 2) / dataset.n
end
```
where the example loss is the mean-squared error.
You may pass a function with the same arguments as this (note
that the name of the function doesn't matter). Here,
both `prediction` and `dataset.y` are 1D arrays of length `dataset.n`.
Default is `None`.
complexity_of_operators : dict[str, float]
If you would like to use a complexity other than 1 for an
operator, specify the complexity here. For example,
@@ -681,7 +701,8 @@ def __init__(
timeout_in_seconds=None,
constraints=None,
nested_constraints=None,
loss="L2DistLoss()",
loss=None,
full_objective=None,
complexity_of_operators=None,
complexity_of_constants=1,
complexity_of_variables=1,
@@ -770,6 +791,7 @@ def __init__(
self.early_stop_condition = early_stop_condition
# - Loss parameters
self.loss = loss
self.full_objective = full_objective
self.complexity_of_operators = complexity_of_operators
self.complexity_of_constants = complexity_of_constants
self.complexity_of_variables = complexity_of_variables
@@ -1224,6 +1246,9 @@ def _validate_and_set_init_params(self):
"to True and `procs` to 0 will result in non-deterministic searches. "
)

if self.loss is not None and self.full_objective is not None:
raise ValueError("You cannot set both `loss` and `full_objective`.")

# NotImplementedError - Values that could be supported at a later time
if self.optimizer_algorithm not in VALID_OPTIMIZER_ALGORITHMS:
raise NotImplementedError(
@@ -1553,6 +1578,8 @@ def _run(self, X, y, mutated_params, weights, seed):
complexity_of_operators = Main.eval(complexity_of_operators_str)

custom_loss = Main.eval(self.loss)
custom_full_objective = Main.eval(self.full_objective)

early_stop_condition = Main.eval(
str(self.early_stop_condition) if self.early_stop_condition else None
)
@@ -1581,6 +1608,7 @@ def _run(self, X, y, mutated_params, weights, seed):
complexity_of_variables=self.complexity_of_variables,
nested_constraints=nested_constraints,
elementwise_loss=custom_loss,
loss_function=custom_full_objective,
maxsize=int(self.maxsize),
output_file=_escape_filename(self.equation_file_),
npopulations=int(self.populations),
21 changes: 17 additions & 4 deletions pysr/test/test.py
@@ -72,26 +72,39 @@ def test_linear_relation_weighted(self):
print(model.equations_)
self.assertLessEqual(model.get_best()["loss"], 1e-4)

def test_multiprocessing_turbo(self):
def test_multiprocessing_turbo_custom_objective(self):
rstate = np.random.RandomState(0)
y = self.X[:, 0]
y += rstate.randn(*y.shape) * 1e-4
model = PySRRegressor(
**self.default_test_kwargs,
# Turbo needs to work with unsafe operators:
unary_operators=["sqrt"],
procs=2,
multithreading=False,
turbo=True,
early_stop_condition="stop_if(loss, complexity) = loss < 1e-4 && complexity == 1",
early_stop_condition="stop_if(loss, complexity) = loss < 1e-10 && complexity == 1",
full_objective="""
function my_objective(tree::Node{T}, dataset::Dataset{T}, options::Options) where T
prediction, flag = eval_tree_array(tree, dataset.X, options)
!flag && return T(Inf)
abs3(x) = abs(x) ^ 3
return sum(abs3, prediction .- dataset.y) / length(prediction)
end
""",
)
model.fit(self.X, y)
print(model.equations_)
self.assertLessEqual(model.equations_.iloc[-1]["loss"], 1e-4)
best_loss = model.equations_.iloc[-1]["loss"]
self.assertLessEqual(best_loss, 1e-10)
self.assertGreaterEqual(best_loss, 0.0)

def test_high_precision_search(self):
def test_high_precision_search_custom_loss(self):
y = 1.23456789 * self.X[:, 0]
model = PySRRegressor(
**self.default_test_kwargs,
early_stop_condition="stop_if(loss, complexity) = loss < 1e-4 && complexity == 3",
loss="my_loss(prediction, target) = (prediction - target)^2",
precision=64,
parsimony=0.01,
warm_start=True,
