diff --git a/docs/examples.md b/docs/examples.md
index ddec4b301..fae1ca42f 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -318,7 +318,124 @@ model.predict(X, -1)
 
 to make predictions with the most accurate expression.
 
-## 9. Additional features
+## 9. Custom objectives
+
+You can also pass a custom objective as a snippet of Julia code,
+which might include symbolic manipulations or custom functional forms.
+These do not even need to be differentiable! First, let's look at the
+default objective used (a simplified version, without weights
+and with mean squared error), so that you can see how to write your own:
+
+```julia
+function default_objective(tree, dataset::Dataset{T,L}, options)::L where {T,L}
+    (prediction, completion) = eval_tree_array(tree, dataset.X, options)
+    if !completion
+        return L(Inf)
+    end
+
+    diffs = prediction .- dataset.y
+
+    return sum(diffs .^ 2) / length(diffs)
+end
+```
+
+Here, the `where {T,L}` syntax defines the function for arbitrary types `T` and `L`.
+If you have `precision=32` (default) and pass in regular floating point data,
+then both `T` and `L` will be equal to `Float32`. If you pass in complex data,
+then `T` will be `ComplexF32` and `L` will be `Float32` (since we need to return
+a real number from the loss function). But you don't need to worry about this;
+just make sure to return a scalar number of type `L`.
+
+The `tree` argument is the current expression being evaluated. You can read
+about the `tree` fields [here](https://astroautomata.com/SymbolicRegression.jl/stable/types/).
+
+For example, let's fix the symbolic form of the expression to be a rational
+function, i.e., $P(X)/Q(X)$ for polynomials $P$ and $Q$:
+
+```python
+objective = """
+function my_custom_objective(tree, dataset::Dataset{T,L}, options) where {T,L}
+    # Require root node to be binary, so we can split it,
+    # otherwise return a large loss:
+    tree.degree != 2 && return L(Inf)
+
+    P = tree.l
+    Q = tree.r
+
+    # Evaluate numerator:
+    P_prediction, flag = eval_tree_array(P, dataset.X, options)
+    !flag && return L(Inf)
+
+    # Evaluate denominator:
+    Q_prediction, flag = eval_tree_array(Q, dataset.X, options)
+    !flag && return L(Inf)
+
+    # Impose functional form:
+    prediction = P_prediction ./ Q_prediction
+
+    diffs = prediction .- dataset.y
+
+    return sum(diffs .^ 2) / length(diffs)
+end
+"""
+
+model = PySRRegressor(
+    niterations=100,
+    binary_operators=["*", "+", "-"],
+    full_objective=objective,
+)
+```
+
+> **Warning**: When using a custom objective like this that performs symbolic
+> manipulations, many functionalities of PySR will not work, such as `.sympy()`,
+> `.predict()`, etc. This is because the SymPy parsing does not know about
+> how you are manipulating the expression, so you will need to do this yourself.
+
+Note how we did not pass `/` as a binary operator; it will just be implicit
+in the functional form.
+
+Let's generate an equation of the form $\frac{x_0^2 x_1 - 2}{x_2^2 + 1}$:
+
+```python
+X = np.random.randn(1000, 3)
+y = (X[:, 0]**2 * X[:, 1] - 2) / (X[:, 2]**2 + 1)
+```
+
+Finally, let's fit:
+
+```python
+model.fit(X, y)
+```
+
+> Note that the printed equation is not the same as the evaluated equation,
+> because the printing functionality does not know about the functional form.
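+
+Even so, the `loss` column reported in `model.equations_` is computed with the
+custom objective itself (i.e., with the implied division), so it can still be
+used to judge the quality of each discovered expression. A minimal sketch of
+inspecting it:
+
+```python
+# These losses were computed by `my_custom_objective`, so they already
+# account for the imposed P(X)/Q(X) functional form:
+print(model.equations_[["complexity", "loss", "equation"]])
+```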
+
+We can get the string format with:
+
+```python
+model.get_best().equation
+```
+
+(or, you could use `model.equations_.iloc[-1].equation`)
+
+For me, this equation was:
+
+```text
+(((2.3554819 + -0.3554746) - (x1 * (x0 * x0))) - (-1.0000019 - (x2 * x2)))
+```
+
+Looking at the bracket structure of the equation, we can see that the outermost
+bracket is split at the `-` operator (note that the root operator is ignored in
+the evaluation, since we simply evaluate each argument and divide the results) into
+`((2.3554819 + -0.3554746) - (x1 * (x0 * x0)))` and
+`(-1.0000019 - (x2 * x2))`, meaning that our discovered equation is
+equal to
+$\frac{x_0^2 x_1 - 2.0000073}{x_2^2 + 1.0000019}$, which
+is nearly the same as the true equation!
+
+
+## 10. Additional features
 
 For the many other features available in PySR, please
 read the [Options section](options.md).
diff --git a/docs/param_groupings.yml b/docs/param_groupings.yml
index 62ca50011..b608a24de 100644
--- a/docs/param_groupings.yml
+++ b/docs/param_groupings.yml
@@ -11,6 +11,7 @@
     - ncyclesperiteration
 - The Objective:
     - loss
+    - full_objective
     - model_selection
 - Working with Complexities:
     - parsimony
diff --git a/pysr/sr.py b/pysr/sr.py
index 1b31ca14b..074cf6647 100644
--- a/pysr/sr.py
+++ b/pysr/sr.py
@@ -320,9 +320,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         argument is constrained. Default is `None`.
     loss : str
-        String of Julia code specifying the loss function. Can either
-        be a loss from LossFunctions.jl, or your own loss written as a
-        function. Examples of custom written losses include:
+        String of Julia code specifying an elementwise loss function.
+        Can either be a loss from LossFunctions.jl, or your own loss
+        written as a function. Examples of custom written losses include:
         `myloss(x, y) = abs(x-y)` for non-weighted, or
         `myloss(x, y, w) = w*abs(x-y)` for weighted.
         The included losses include:
@@ -335,6 +335,26 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         `ModifiedHuberLoss()`, `L2MarginLoss()`, `ExpLoss()`,
         `SigmoidLoss()`, `DWDMarginLoss(q)`.
         Default is `"L2DistLoss()"`.
+    full_objective : str
+        Alternatively, you can specify the full objective function as
+        a snippet of Julia code, including any sort of custom evaluation
+        (including symbolic manipulations beforehand), and any sort
+        of loss function or regularization. The default `full_objective`
+        used in SymbolicRegression.jl is roughly equal to:
+        ```julia
+        function eval_loss(tree, dataset::Dataset{T}, options)::T where T
+            prediction, flag = eval_tree_array(tree, dataset.X, options)
+            if !flag
+                return T(Inf)
+            end
+            sum((prediction .- dataset.y) .^ 2) / dataset.n
+        end
+        ```
+        where the example elementwise loss is mean-squared error.
+        You may pass a function with the same arguments as this (note
+        that the name of the function doesn't matter). Here,
+        both `prediction` and `dataset.y` are 1D arrays of length `dataset.n`.
+        Default is `None`.
     complexity_of_operators : dict[str, float]
         If you would like to use a complexity other than 1 for
         an operator, specify the complexity here. For example,
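A hypothetical usage sketch of the `loss` / `full_objective` split documented
above (parameter values are illustrative only; the two options are mutually
exclusive, as enforced by the validation added further below):

```python
from pysr import PySRRegressor

# Either an elementwise loss...
model_a = PySRRegressor(loss="my_loss(x, y) = abs(x - y)")

# ...or a full objective written as a Julia snippet, but not both:
model_b = PySRRegressor(
    full_objective="""
    function my_objective(tree, dataset::Dataset{T,L}, options) where {T,L}
        prediction, flag = eval_tree_array(tree, dataset.X, options)
        !flag && return L(Inf)
        return sum(abs, prediction .- dataset.y) / dataset.n
    end
    """,
)

# Setting both `loss` and `full_objective` raises a ValueError once the
# parameters are validated (i.e., when `.fit()` is called).
```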
@@ -681,7 +701,8 @@ def __init__(
         timeout_in_seconds=None,
         constraints=None,
         nested_constraints=None,
-        loss="L2DistLoss()",
+        loss=None,
+        full_objective=None,
         complexity_of_operators=None,
         complexity_of_constants=1,
         complexity_of_variables=1,
@@ -770,6 +791,7 @@
         self.early_stop_condition = early_stop_condition
         # - Loss parameters
         self.loss = loss
+        self.full_objective = full_objective
         self.complexity_of_operators = complexity_of_operators
         self.complexity_of_constants = complexity_of_constants
         self.complexity_of_variables = complexity_of_variables
@@ -1224,6 +1246,9 @@ def _validate_and_set_init_params(self):
                 "to True and `procs` to 0 will result in non-deterministic searches. "
             )
 
+        if self.loss is not None and self.full_objective is not None:
+            raise ValueError("You cannot set both `loss` and `full_objective`.")
+
         # NotImplementedError - Values that could be supported at a later time
         if self.optimizer_algorithm not in VALID_OPTIMIZER_ALGORITHMS:
             raise NotImplementedError(
@@ -1553,6 +1578,8 @@ def _run(self, X, y, mutated_params, weights, seed):
             complexity_of_operators = Main.eval(complexity_of_operators_str)
 
         custom_loss = Main.eval(self.loss)
+        custom_full_objective = Main.eval(self.full_objective)
+
         early_stop_condition = Main.eval(
             str(self.early_stop_condition) if self.early_stop_condition else None
         )
@@ -1581,6 +1608,7 @@
             complexity_of_variables=self.complexity_of_variables,
             nested_constraints=nested_constraints,
             elementwise_loss=custom_loss,
+            loss_function=custom_full_objective,
             maxsize=int(self.maxsize),
             output_file=_escape_filename(self.equation_file_),
             npopulations=int(self.populations),
diff --git a/pysr/test/test.py b/pysr/test/test.py
index f3e7495ba..d1972c5e9 100644
--- a/pysr/test/test.py
+++ b/pysr/test/test.py
@@ -72,8 +72,10 @@ def test_linear_relation_weighted(self):
         print(model.equations_)
         self.assertLessEqual(model.get_best()["loss"], 1e-4)
 
-    def test_multiprocessing_turbo(self):
+    def test_multiprocessing_turbo_custom_objective(self):
+        rstate = np.random.RandomState(0)
         y = self.X[:, 0]
+        y += rstate.randn(*y.shape) * 1e-4
         model = PySRRegressor(
             **self.default_test_kwargs,
             # Turbo needs to work with unsafe operators:
@@ -81,17 +83,28 @@ def test_multiprocessing_turbo(self):
             procs=2,
             multithreading=False,
             turbo=True,
-            early_stop_condition="stop_if(loss, complexity) = loss < 1e-4 && complexity == 1",
+            early_stop_condition="stop_if(loss, complexity) = loss < 1e-10 && complexity == 1",
+            full_objective="""
+            function my_objective(tree::Node{T}, dataset::Dataset{T}, options::Options) where T
+                prediction, flag = eval_tree_array(tree, dataset.X, options)
+                !flag && return T(Inf)
+                abs3(x) = abs(x) ^ 3
+                return sum(abs3, prediction .- dataset.y) / length(prediction)
+            end
+            """,
         )
         model.fit(self.X, y)
         print(model.equations_)
-        self.assertLessEqual(model.equations_.iloc[-1]["loss"], 1e-4)
+        best_loss = model.equations_.iloc[-1]["loss"]
+        self.assertLessEqual(best_loss, 1e-10)
+        self.assertGreaterEqual(best_loss, 0.0)
 
-    def test_high_precision_search(self):
+    def test_high_precision_search_custom_loss(self):
         y = 1.23456789 * self.X[:, 0]
         model = PySRRegressor(
             **self.default_test_kwargs,
             early_stop_condition="stop_if(loss, complexity) = loss < 1e-4 && complexity == 3",
+            loss="my_loss(prediction, target) = (prediction - target)^2",
             precision=64,
             parsimony=0.01,
             warm_start=True,