From 18c1f9cc18ec32e1c5e5f1d61d6a21be8b231877 Mon Sep 17 00:00:00 2001 From: Miles Cranmer Date: Tue, 12 Jul 2022 20:12:14 -0400 Subject: [PATCH 01/40] Center formatting for README --- README.md | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9d2e72e27..a3630781c 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,24 @@ [//]: # (Logo:) - +
+ + # PySR: High-Performance Symbolic Regression in Python +
+ + PySR is built on an extremely optimized pure-Julia backend, and uses regularized evolution, simulated annealing, and gradient-free optimization to search for equations that fit your data. +
+ | **Docs** | **colab** | **pip** | **conda** | **Stats** | |---|---|---|---|---| |[![Documentation](https://github.com/MilesCranmer/PySR/actions/workflows/docs.yml/badge.svg)](https://astroautomata.com/PySR/)|[![Colab](https://img.shields.io/badge/colab-notebook-yellow)](https://colab.research.google.com/github/MilesCranmer/PySR/blob/master/examples/pysr_demo.ipynb)|[![PyPI version](https://badge.fury.io/py/pysr.svg)](https://badge.fury.io/py/pysr)|[![Conda Version](https://img.shields.io/conda/vn/conda-forge/pysr.svg)](https://anaconda.org/conda-forge/pysr)|[![Downloads](https://pepy.tech/badge/pysr)](https://badge.fury.io/py/pysr)| +
+ (pronounced like *py* as in python, and then *sur* as in surface) @@ -17,7 +26,10 @@ If you find PySR useful, please cite it using the citation information given in If you've finished a project with PySR, please submit a PR to showcase your work on the [Research Showcase page](https://astroautomata.com/PySR/#/papers)! -### Test status: +
+ +### Test status + | **Linux** | **Windows** | **macOS (intel)** | |---|---|---| |[![Linux](https://github.com/MilesCranmer/PySR/actions/workflows/CI.yml/badge.svg)](https://github.com/MilesCranmer/PySR/actions/workflows/CI.yml)|[![Windows](https://github.com/MilesCranmer/PySR/actions/workflows/CI_Windows.yml/badge.svg)](https://github.com/MilesCranmer/PySR/actions/workflows/CI_Windows.yml)|[![macOS](https://github.com/MilesCranmer/PySR/actions/workflows/CI_mac.yml/badge.svg)](https://github.com/MilesCranmer/PySR/actions/workflows/CI_mac.yml)| @@ -25,6 +37,8 @@ If you've finished a project with PySR, please submit a PR to showcase your work |[![Docker](https://github.com/MilesCranmer/PySR/actions/workflows/CI_docker.yml/badge.svg)](https://github.com/MilesCranmer/PySR/actions/workflows/CI_docker.yml)|[![conda-forge](https://github.com/MilesCranmer/PySR/actions/workflows/CI_conda_forge.yml/badge.svg)](https://github.com/MilesCranmer/PySR/actions/workflows/CI_conda_forge.yml)|[![Coverage Status](https://coveralls.io/repos/github/MilesCranmer/PySR/badge.svg?branch=master&service=github)](https://coveralls.io/github/MilesCranmer/PySR)| +
+ Check out [SymbolicRegression.jl](https://github.com/MilesCranmer/SymbolicRegression.jl) for the pure-Julia backend of this package. @@ -58,10 +72,14 @@ python interface. # Installation +
+ | pip (macOS, Linux, Windows) | conda (macOS - only Intel, Linux) | |---|---| | 1. Install Julia manually (see [downloads](https://julialang.org/downloads/))
2. `pip install pysr`
3. `python -c 'import pysr; pysr.install()'` | 1. `conda install -c conda-forge pysr`
2. `python -c 'import pysr; pysr.install()'`| +
+ This last step will install and update the required Julia packages, including `PyCall.jl`. From ba3d14088548e5f3a747e090ffec6df4bb2cf602 Mon Sep 17 00:00:00 2001 From: Saurav Maheshkar Date: Mon, 18 Jul 2022 01:28:30 +0530 Subject: [PATCH 02/40] feat(docker): Add opencontainers image-spec to `Dockerfile` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR makes a few tiny changes to improve the overall quality of the docker image 🐳 . For reference more annotations can be found [here](https://github.com/opencontainers/image-spec/blob/main/annotations.md) --- Dockerfile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Dockerfile b/Dockerfile index e3382f3bd..216b2885a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,6 +6,12 @@ ARG VERSION=latest FROM --platform=$ARCH julia:$VERSION +# metainformation +LABEL org.opencontainers.image.version = "0.9.0" +LABEL org.opencontainers.image.authors = "Miles Cranmer" +LABEL org.opencontainers.image.source = "https://github.com/MilesCranmer/PySR" +LABEL org.opencontainers.image.licenses = "Apache License 2.0" + # Need to use ARG after FROM, otherwise it won't get passed through. ARG PYVERSION=3.9.10 From 36e49b9581dc6c792e84cb3a1171b0d52fd38864 Mon Sep 17 00:00:00 2001 From: Saurav Maheshkar Date: Mon, 18 Jul 2022 02:09:15 +0530 Subject: [PATCH 03/40] feat: add `ARG` for package version --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 216b2885a..e38351b41 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,11 +3,12 @@ ARG ARCH=linux/amd64 ARG VERSION=latest +ARG PKGVERSION=0.9.0 FROM --platform=$ARCH julia:$VERSION # metainformation -LABEL org.opencontainers.image.version = "0.9.0" +LABEL org.opencontainers.image.version = $PKGVERSION LABEL org.opencontainers.image.authors = "Miles Cranmer" LABEL org.opencontainers.image.source = "https://github.com/MilesCranmer/PySR" LABEL org.opencontainers.image.licenses = "Apache License 2.0" From 7fa5f44ea9c0d3be65b9022b24e0b07b47e121a7 Mon Sep 17 00:00:00 2001 From: Miles Cranmer Date: Sun, 17 Jul 2022 17:02:25 -0400 Subject: [PATCH 04/40] Update Dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index e38351b41..9a45c39db 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ ARG ARCH=linux/amd64 ARG VERSION=latest -ARG PKGVERSION=0.9.0 +ARG PKGVERSION=0.9.5 FROM --platform=$ARCH julia:$VERSION From ccf71e9b9eb54f0bdb5cca88414cfb34639ec7f8 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Wed, 20 Jul 2022 14:28:04 -0400 Subject: [PATCH 05/40] `load` function to init model from saved equations --- pysr/__init__.py | 1 + pysr/sr.py | 74 ++++++++++++++++++++++++++++++++++++++++++++++++ test/test.py | 28 +++++++++++++++++- 3 files changed, 102 insertions(+), 1 deletion(-) diff --git a/pysr/__init__.py b/pysr/__init__.py index e303becb2..210e85cb7 100644 --- a/pysr/__init__.py +++ b/pysr/__init__.py @@ -6,6 +6,7 @@ best_tex, best_callable, best_row, + load, ) from .julia_helpers import install from .feynman_problems import Problem, FeynmanProblem diff --git a/pysr/sr.py b/pysr/sr.py index 3e2112975..ec12fe877 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -2034,3 +2034,77 @@ def run_feature_selection(X, y, select_k_features, random_state=None): clf, threshold=-np.inf, max_features=select_k_features, prefit=True ) return selector.get_support(indices=True) + + +def load( + equation_file, + *, + binary_operators, + unary_operators, + n_features_in, + feature_names_in=None, + selection_mask=None, + nout=1, + **pysr_kwargs, +): + """ + Create a model from equations stored as a csv file + + Parameters + ---------- + equation_file : str + Path to a csv file containing equations. + + binary_operators : list[str], default=["+", "-", "*", "/"] + The same binary operators used when creating the model. + + unary_operators : list[str], default=None + The same unary operators used when creating the model. + + n_features_in : int + Number of features passed to the model. + + feature_names_in : list[str], default=None + Names of the features passed to the model. + + selection_mask : list[bool], default=None + If using select_k_features, you must pass `model.selection_mask_` here. + + nout : int, default=1 + Number of outputs of the model. + + pysr_kwargs : dict + Any other keyword arguments to initialize the PySRRegressor object. + + Returns + ------- + model : PySRRegressor + The model with fitted equations. + """ + + # TODO: copy .bkup file if exists. + model = PySRRegressor( + equation_file=equation_file, + binary_operators=binary_operators, + unary_operators=unary_operators, + **pysr_kwargs, + ) + + model.equation_file_ = equation_file + model.nout_ = nout + model.n_features_in_ = n_features_in + + if feature_names_in is None: + model.feature_names_in_ = [f"x{i}" for i in range(n_features_in)] + else: + assert len(feature_names_in) == n_features_in + model.feature_names_in_ = feature_names_in + + if selection_mask is None: + model.selection_mask_ = np.ones(n_features_in, dtype=bool) + else: + model.selection_mask_ = selection_mask + + model.refresh() + + return model diff --git a/test/test.py b/test/test.py index 4c82a17e1..1c581b7fe 100644 --- a/test/test.py +++ b/test/test.py @@ -4,7 +4,7 @@ import unittest import numpy as np from sklearn import model_selection -from pysr import PySRRegressor +from pysr import PySRRegressor, load from pysr.sr import run_feature_selection, _handle_feature_selection from sklearn.utils.estimator_checks import check_estimator import sympy @@ -280,6 +280,32 @@ def test_high_dim_selection_early_stop(self): model.fit(X.values, y.values, Xresampled=Xresampled.values) self.assertLess(np.average((model.predict(X.values) - y.values) ** 2), 1e-4) + def test_load_model(self): + """See if we can load a ran model from the equation file.""" + csv_file_data = """ + Complexity|MSE|Equation + 1|0.19951081|1.9762075 + 3|0.12717344|(f0 + 1.4724599) + 4|0.104823045|pow_abs(2.2683423, cos(f3))""" + # Strip the indents: + csv_file_data = "\n".join([l.strip() for l in csv_file_data.split("\n")]) + with open("equation_file.csv", "w") as f: + f.write(csv_file_data) + with open("equation_file.csv.bkup", "w") as f: + f.write(csv_file_data) + model = load( + "equation_file.csv", + n_features_in=5, + feature_names_in=["f0", "f1", "f2", "f3", "f4"], + binary_operators=["+", "*", "/", "-", "^"], + unary_operators=["cos"], + ) + X = self.rstate.rand(100, 5) + y_truth = 2.2683423 ** np.cos(X[:, 3]) + y_test = model.predict(X, 2) + + np.testing.assert_allclose(y_truth, y_test) + class TestBest(unittest.TestCase): def setUp(self): From e5b4869851bd5a5f7b6fa783cf6d0f8ff10ca8a5 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Wed, 20 Jul 2022 14:32:27 -0400 Subject: [PATCH 06/40] Call `refresh` in load function --- pysr/sr.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pysr/sr.py b/pysr/sr.py index ec12fe877..07b34cd31 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -2090,7 +2090,6 @@ def load( **pysr_kwargs, ) - model.equation_file_ = equation_file model.nout_ = nout model.n_features_in_ = n_features_in @@ -2105,6 +2104,6 @@ def load( else: model.selection_mask_ = selection_mask - model.refresh() + model.refresh(checkpoint_file=equation_file) return model From 179fef6351ee7d5356ec2a1ce9efcda8241dd935 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Jul 2022 22:42:59 -0400 Subject: [PATCH 07/40] Correctly set path names --- test/test.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/test.py b/test/test.py index 1c581b7fe..59c7d76bc 100644 --- a/test/test.py +++ b/test/test.py @@ -12,6 +12,7 @@ import warnings import pickle as pkl import tempfile +from pathlib import Path DEFAULT_PARAMS = inspect.signature(PySRRegressor.__init__).parameters DEFAULT_NITERATIONS = DEFAULT_PARAMS["niterations"].default @@ -289,12 +290,14 @@ def test_load_model(self): 4|0.104823045|pow_abs(2.2683423, cos(f3))""" # Strip the indents: csv_file_data = "\n".join([l.strip() for l in csv_file_data.split("\n")]) - with open("equation_file.csv", "w") as f: + rand_dir = Path(tempfile.mkdtemp()) + equation_filename = rand_dir / "equation.csv" + with open(equation_filename, "w") as f: f.write(csv_file_data) - with open("equation_file.csv.bkup", "w") as f: + with open(equation_filename + ".bkup", "w") as f: f.write(csv_file_data) model = load( - "equation_file.csv", + equation_filename, n_features_in=5, feature_names_in=["f0", "f1", "f2", "f3", "f4"], binary_operators=["+", "*", "/", "-", "^"], From 85371bb899ddc546f448adec34e5b2977f080f9d Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Jul 2022 22:44:41 -0400 Subject: [PATCH 08/40] Allow pickling without equations_ stored --- pysr/sr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pysr/sr.py b/pysr/sr.py index 07b34cd31..67c11f58c 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -883,7 +883,9 @@ def __getstate__(self): key: None if key == "raw_julia_state_" else value for key, value in state.items() } - if "equations_" in pickled_state: + if ("equations_" in pickled_state) and ( + pickled_state["equations_"] is not None + ): pickled_state["output_torch_format"] = False pickled_state["output_jax_format"] = False if self.nout_ == 1: From dde0ef7e3c2606415b9d7c03c56370402c398e3e Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Jul 2022 22:45:13 -0400 Subject: [PATCH 09/40] Remove extra_sympy_mappings from pickle file --- pysr/sr.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/pysr/sr.py b/pysr/sr.py index 67c11f58c..e147d08a7 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -562,6 +562,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): equation_file_contents_ : list[pandas.DataFrame] Contents of the equation file output by the Julia backend. + show_pickle_warnings_ : bool + Whether to show warnings about what attributes can be pickled. + Notes ----- Most default parameters have been tuned over several example equations, @@ -873,14 +876,26 @@ def __getstate__(self): from the pickled instance. """ state = self.__dict__ - if "raw_julia_state_" in state: + show_pickle_warning = not ( + "show_pickle_warnings_" in state and not state["show_pickle_warnings_"] + ) + if "raw_julia_state_" in state and show_pickle_warning: warnings.warn( "raw_julia_state_ cannot be pickled and will be removed from the " "serialized instance. This will prevent a `warm_start` fit of any " "model that is deserialized via `pickle.load()`." ) + state_keys_containing_lambdas = ["extra_sympy_mappings", "extra_torch_mappings"] + for state_key in state_keys_containing_lambdas: + if state[state_key] is not None and show_pickle_warning: + warnings.warn( + f"`{state_key}` cannot be pickled and will be removed from the " + "serialized instance. When loading the model, please redefine " + f"`{state_key}` at runtime." + ) + state_keys_to_clear = ["raw_julia_state_"] + state_keys_containing_lambdas pickled_state = { - key: None if key == "raw_julia_state_" else value + key: (None if key in state_keys_to_clear else value) for key, value in state.items() } if ("equations_" in pickled_state) and ( From b16d9efb3c83ced7870fbb7641fa97cfd9452a2d Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Jul 2022 22:45:42 -0400 Subject: [PATCH 10/40] Automatically pickle file at initialization --- pysr/sr.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pysr/sr.py b/pysr/sr.py index e147d08a7..63d74fe61 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -1623,6 +1623,11 @@ def fit( y, ) + # Save model state: + self.show_pickle_warnings_ = False + with open(str(self.equation_file_) + ".pkl", "wb") as f: + pkl.dump(self, f) + self.show_pickle_warnings_ = True # Fitting procedure return self._run(X, y, mutated_params, weights=weights, seed=seed) From 5c0ad5569248da926a646aeb6194ad9d03fc9844 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Jul 2022 22:46:11 -0400 Subject: [PATCH 11/40] Allow loading from pickle file --- pysr/sr.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/pysr/sr.py b/pysr/sr.py index 63d74fe61..cd851c16b 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -2061,9 +2061,9 @@ def run_feature_selection(X, y, select_k_features, random_state=None): def load( equation_file, *, - binary_operators, - unary_operators, - n_features_in, + binary_operators=None, + unary_operators=None, + n_features_in=None, feature_names_in=None, selection_mask=None, nout=1, @@ -2097,12 +2097,33 @@ def load( pysr_kwargs : dict Any other keyword arguments to initialize the PySRRegressor object. + These will overwrite those stored in the pickle file. Returns ------- model : PySRRegressor The model with fitted equations. """ + # Try to load model from .pkl + print(f"Checking if {equation_file}.pkl exists...") + if os.path.exists(str(equation_file) + ".pkl"): + assert binary_operators is None + assert unary_operators is None + assert n_features_in is None + with open(str(equation_file) + ".pkl", "rb") as f: + model = pkl.load(f) + model.set_params(**pysr_kwargs) + model.refresh() + return model + + # Else, we re-create it. + print( + f"{equation_file}.pkl does not exist, " + "so we must create the model from scratch." + ) + assert binary_operators is not None + assert unary_operators is not None + assert n_features_in is not None # TODO: copy .bkup file if exists. model = PySRRegressor( From dc1d66378e25d7a8ef5d45811e0a67d0b00c449e Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Jul 2022 22:46:43 -0400 Subject: [PATCH 12/40] Add pickle files to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 23004701d..f0daf5e88 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ *.csv *.csv.out* *.bkup +*.pkl performance*txt *.out trials* From 4ae8a5c2380b0fa6dd34f2f56207d2dc3970a362 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Jul 2022 22:47:00 -0400 Subject: [PATCH 13/40] Add missing pickle import --- pysr/sr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pysr/sr.py b/pysr/sr.py index cd851c16b..c093d3619 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -8,6 +8,7 @@ import tempfile import shutil from pathlib import Path +import pickle as pkl from datetime import datetime import warnings from multiprocessing import cpu_count From 78cdb0e736efe0575ecd7797e7ad7e07b6ecd447 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Jul 2022 22:47:15 -0400 Subject: [PATCH 14/40] Add test for loading from pickle file --- test/test.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/test/test.py b/test/test.py index 59c7d76bc..33999e087 100644 --- a/test/test.py +++ b/test/test.py @@ -309,6 +309,33 @@ def test_load_model(self): np.testing.assert_allclose(y_truth, y_test) + def test_load_model_simple(self): + # Test that we can simply load a model from its equation file. + y = self.X[:, [0, 1]] ** 2 + model = PySRRegressor( + # Test that passing a single operator works: + unary_operators="sq(x) = x^2", + binary_operators="plus", + extra_sympy_mappings={"sq": lambda x: x**2}, + **self.default_test_kwargs, + procs=0, + denoise=True, + early_stop_condition="stop_if(loss, complexity) = loss < 0.05 && complexity == 2", + ) + rand_dir = Path(tempfile.mkdtemp()) + equation_file = rand_dir / "equations.csv" + model.set_params(temp_equation_file=False) + model.set_params(equation_file=equation_file) + model.fit(self.X, y) + + # lambda functions are removed from the pickling, so we need + # to pass it during the loading: + model2 = load( + model.equation_file_, extra_sympy_mappings={"sq": lambda x: x**2} + ) + + np.testing.assert_allclose(model.predict(self.X), model2.predict(self.X)) + class TestBest(unittest.TestCase): def setUp(self): From 8da5000dfc58c1a035c623ebd6c6e2acea472134 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Jul 2022 22:57:40 -0400 Subject: [PATCH 15/40] Improve error message for missing operator mappings --- pysr/sr.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pysr/sr.py b/pysr/sr.py index 3e2112975..736be313e 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -1694,7 +1694,8 @@ def predict(self, X, index=None): raise ValueError( "Failed to evaluate the expression. " "If you are using a custom operator, make sure to define it in :param`extra_sympy_mappings`, " - "e.g., `model.set_params(extra_sympy_mappings={'inv': lambda x: 1 / x})`." + "e.g., `model.set_params(extra_sympy_mappings={'inv': lambda x: 1 / x})`. You can then " + "run `model.refresh()` to re-load the expressions." ) from error def sympy(self, index=None): From 214744b5ce5f1a375f5af936b774ca3a3b26bdd4 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Jul 2022 22:59:41 -0400 Subject: [PATCH 16/40] Fix filename concat in test --- test/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test.py b/test/test.py index 33999e087..0e1108400 100644 --- a/test/test.py +++ b/test/test.py @@ -294,7 +294,7 @@ def test_load_model(self): equation_filename = rand_dir / "equation.csv" with open(equation_filename, "w") as f: f.write(csv_file_data) - with open(equation_filename + ".bkup", "w") as f: + with open(str(equation_filename) + ".bkup", "w") as f: f.write(csv_file_data) model = load( equation_filename, From 58e25a9d7261ceb4509125d6f8d102b68bfe5fd4 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Jul 2022 23:30:52 -0400 Subject: [PATCH 17/40] Test both with and without `bkup` file --- test/test.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/test/test.py b/test/test.py index 0e1108400..30a50d1cc 100644 --- a/test/test.py +++ b/test/test.py @@ -290,24 +290,24 @@ def test_load_model(self): 4|0.104823045|pow_abs(2.2683423, cos(f3))""" # Strip the indents: csv_file_data = "\n".join([l.strip() for l in csv_file_data.split("\n")]) - rand_dir = Path(tempfile.mkdtemp()) - equation_filename = rand_dir / "equation.csv" - with open(equation_filename, "w") as f: - f.write(csv_file_data) - with open(str(equation_filename) + ".bkup", "w") as f: - f.write(csv_file_data) - model = load( - equation_filename, - n_features_in=5, - feature_names_in=["f0", "f1", "f2", "f3", "f4"], - binary_operators=["+", "*", "/", "-", "^"], - unary_operators=["cos"], - ) - X = self.rstate.rand(100, 5) - y_truth = 2.2683423 ** np.cos(X[:, 3]) - y_test = model.predict(X, 2) - np.testing.assert_allclose(y_truth, y_test) + for from_backup in [False, True]: + rand_dir = Path(tempfile.mkdtemp()) + equation_filename = str(rand_dir / "equation.csv") + with open(equation_filename + (".bkup" if from_backup else ""), "w") as f: + f.write(csv_file_data) + model = load( + equation_filename, + n_features_in=5, + feature_names_in=["f0", "f1", "f2", "f3", "f4"], + binary_operators=["+", "*", "/", "-", "^"], + unary_operators=["cos"], + ) + X = self.rstate.rand(100, 5) + y_truth = 2.2683423 ** np.cos(X[:, 3]) + y_test = model.predict(X, 2) + + np.testing.assert_allclose(y_truth, y_test) def test_load_model_simple(self): # Test that we can simply load a model from its equation file. From 1f019764c8af52477ed2f066b50e17e7474ec26e Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Jul 2022 23:31:28 -0400 Subject: [PATCH 18/40] Don't check for `equation_file_` until after checkpoint_file set --- pysr/sr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pysr/sr.py b/pysr/sr.py index c093d3619..be9224893 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -1642,10 +1642,10 @@ def refresh(self, checkpoint_file=None): checkpoint_file : str, default=None Path to checkpoint hall of fame file to be loaded. """ - check_is_fitted(self, attributes=["equation_file_"]) if checkpoint_file: self.equation_file_ = checkpoint_file self.equation_file_contents_ = None + check_is_fitted(self, attributes=["equation_file_"]) self.equations_ = self.get_hof() def predict(self, X, index=None): From f1ac7043f981a5d2fa1202d172986ab9ee261f99 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Jul 2022 23:32:00 -0400 Subject: [PATCH 19/40] Allow both `bkup` and `csv` file --- pysr/sr.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pysr/sr.py b/pysr/sr.py index be9224893..b53d222a0 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -1834,10 +1834,10 @@ def _read_equation_file(self): if self.nout_ > 1: all_outputs = [] for i in range(1, self.nout_ + 1): - df = pd.read_csv( - str(self.equation_file_) + f".out{i}" + ".bkup", - sep="|", - ) + cur_filename = str(self.equation_file_) + f".out{i}" + ".bkup" + if not os.path.exists(cur_filename): + cur_filename = str(self.equation_file_) + f".out{i}" + df = pd.read_csv(cur_filename, sep="|") # Rename Complexity column to complexity: df.rename( columns={ @@ -1850,7 +1850,10 @@ def _read_equation_file(self): all_outputs.append(df) else: - all_outputs = [pd.read_csv(str(self.equation_file_) + ".bkup", sep="|")] + filename = str(self.equation_file_) + ".bkup" + if not os.path.exists(filename): + filename = str(self.equation_file_) + all_outputs = [pd.read_csv(filename, sep="|")] all_outputs[-1].rename( columns={ "Complexity": "complexity", From c6902b714c3a993d00d41a90100cc6de79c5f50f Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Jul 2022 23:32:41 -0400 Subject: [PATCH 20/40] Additional logging messages during load --- pysr/sr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pysr/sr.py b/pysr/sr.py index b53d222a0..d669147e2 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -2111,6 +2111,7 @@ def load( # Try to load model from .pkl print(f"Checking if {equation_file}.pkl exists...") if os.path.exists(str(equation_file) + ".pkl"): + print(f"Loading model from {equation_file}.pkl.") assert binary_operators is None assert unary_operators is None assert n_features_in is None From 6501ca074793bde8af2ef943394c4e03ef43a775 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 1 Aug 2022 14:24:05 -0400 Subject: [PATCH 21/40] Checkpoint model before and after fit --- pysr/sr.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/pysr/sr.py b/pysr/sr.py index d669147e2..2d8e5463b 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -924,6 +924,16 @@ def __getstate__(self): ] return pickled_state + def _checkpoint(self): + """Saves the model's current state to a checkpoint file. + + This should only be used internally by PySRRegressor.""" + # Save model state: + self.show_pickle_warnings_ = False + with open(str(self.equation_file_) + ".pkl", "wb") as f: + pkl.dump(self, f) + self.show_pickle_warnings_ = True + @property def equations(self): # pragma: no cover warnings.warn( @@ -1624,13 +1634,18 @@ def fit( y, ) - # Save model state: - self.show_pickle_warnings_ = False - with open(str(self.equation_file_) + ".pkl", "wb") as f: - pkl.dump(self, f) - self.show_pickle_warnings_ = True - # Fitting procedure - return self._run(X, y, mutated_params, weights=weights, seed=seed) + # Initially, just save model parameters, so that + # it can be loaded from an early exit: + self._checkpoint() + + # Perform the search: + self._run(X, y, mutated_params, weights=weights, seed=seed) + + # Then, after fit, we save again, so the pickle file contains + # the equations: + self._checkpoint() + + return self def refresh(self, checkpoint_file=None): """ From b53e7fafda3ee0a06d9e8ee56f98cc46bd7ddd57 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 1 Aug 2022 14:40:12 -0400 Subject: [PATCH 22/40] Add additional test for loading from pickle file --- pysr/sr.py | 8 ++++++-- test/test.py | 10 ++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/pysr/sr.py b/pysr/sr.py index 2d8e5463b..f99180dd2 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -926,7 +926,7 @@ def __getstate__(self): def _checkpoint(self): """Saves the model's current state to a checkpoint file. - + This should only be used internally by PySRRegressor.""" # Save model state: self.show_pickle_warnings_ = False @@ -2132,8 +2132,12 @@ def load( assert n_features_in is None with open(str(equation_file) + ".pkl", "rb") as f: model = pkl.load(f) + # Update any parameters if necessary, such as + # extra_sympy_mappings: model.set_params(**pysr_kwargs) - model.refresh() + if "equations_" not in model.__dict__ or model.equations_ is None: + model.refresh() + return model # Else, we re-create it. diff --git a/test/test.py b/test/test.py index 30a50d1cc..f5e84570e 100644 --- a/test/test.py +++ b/test/test.py @@ -336,6 +336,16 @@ def test_load_model_simple(self): np.testing.assert_allclose(model.predict(self.X), model2.predict(self.X)) + # Try again, but using only the pickle file: + for file_to_delete in [str(equation_file), str(equation_file) + ".bkup"]: + if os.path.exists(file_to_delete): + os.remove(file_to_delete) + + model3 = load( + model.equation_file_, extra_sympy_mappings={"sq": lambda x: x**2} + ) + np.testing.assert_allclose(model.predict(self.X), model3.predict(self.X)) + class TestBest(unittest.TestCase): def setUp(self): From b8a97f177e29858c39aaeabd1d998b5207be2c95 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Thu, 4 Aug 2022 15:23:41 -0400 Subject: [PATCH 23/40] Use .pkl instead of .csv.pkl --- pysr/sr.py | 38 ++++++++++++++++++++++++++++---------- test/test.py | 21 ++++++++++++++++++++- 2 files changed, 48 insertions(+), 11 deletions(-) diff --git a/pysr/sr.py b/pysr/sr.py index f99180dd2..d1a209cd6 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -930,7 +930,7 @@ def _checkpoint(self): This should only be used internally by PySRRegressor.""" # Save model state: self.show_pickle_warnings_ = False - with open(str(self.equation_file_) + ".pkl", "wb") as f: + with open(_csv_filename_to_pkl_filename(self.equation_file_), "wb") as f: pkl.dump(self, f) self.show_pickle_warnings_ = True @@ -1636,14 +1636,16 @@ def fit( # Initially, just save model parameters, so that # it can be loaded from an early exit: - self._checkpoint() + if not self.temp_equation_file: + self._checkpoint() # Perform the search: self._run(X, y, mutated_params, weights=weights, seed=seed) # Then, after fit, we save again, so the pickle file contains # the equations: - self._checkpoint() + if not self.temp_equation_file: + self._checkpoint() return self @@ -2077,6 +2079,17 @@ def run_feature_selection(X, y, select_k_features, random_state=None): return selector.get_support(indices=True) +def _csv_filename_to_pkl_filename(csv_filename) -> str: + # Assume that the csv filename is of the form "foo.csv" + dirname = str(os.path.dirname(csv_filename)) + basename = str(os.path.basename(csv_filename)) + base = str(os.path.splitext(basename)[0]) + + pkl_basename = base + ".pkl" + + return os.path.join(dirname, pkl_basename) + + def load( equation_file, *, @@ -2094,7 +2107,8 @@ def load( Parameters ---------- equation_file : str - Path to a csv file containing equations. + Path to a csv file containing equations, or a pickle file + containing the model. binary_operators : list[str], default=["+", "-", "*", "/"] The same binary operators used when creating the model. @@ -2123,14 +2137,19 @@ def load( model : PySRRegressor The model with fitted equations. """ + if os.path.splitext(equation_file)[1] != ".pkl": + pkl_filename = _csv_filename_to_pkl_filename(equation_file) + else: + pkl_filename = equation_file + # Try to load model from .pkl - print(f"Checking if {equation_file}.pkl exists...") - if os.path.exists(str(equation_file) + ".pkl"): - print(f"Loading model from {equation_file}.pkl.") + print(f"Checking if {pkl_filename} exists...") + if os.path.exists(pkl_filename): + print(f"Loading model from {pkl_filename}") assert binary_operators is None assert unary_operators is None assert n_features_in is None - with open(str(equation_file) + ".pkl", "rb") as f: + with open(pkl_filename, "rb") as f: model = pkl.load(f) # Update any parameters if necessary, such as # extra_sympy_mappings: @@ -2142,8 +2161,7 @@ def load( # Else, we re-create it. print( - f"{equation_file}.pkl does not exist, " - "so we must create the model from scratch." + f"{equation_file} does not exist, " "so we must create the model from scratch." ) assert binary_operators is not None assert unary_operators is not None diff --git a/test/test.py b/test/test.py index f5e84570e..dd07c601f 100644 --- a/test/test.py +++ b/test/test.py @@ -5,7 +5,11 @@ import numpy as np from sklearn import model_selection from pysr import PySRRegressor, load -from pysr.sr import run_feature_selection, _handle_feature_selection +from pysr.sr import ( + run_feature_selection, + _handle_feature_selection, + _csv_filename_to_pkl_filename, +) from sklearn.utils.estimator_checks import check_estimator import sympy import pandas as pd @@ -341,6 +345,7 @@ def test_load_model_simple(self): if os.path.exists(file_to_delete): os.remove(file_to_delete) + pickle_file = rand_dir / "equations.pkl" model3 = load( model.equation_file_, extra_sympy_mappings={"sq": lambda x: x**2} ) @@ -430,6 +435,20 @@ def test_feature_selection_handler(self): class TestMiscellaneous(unittest.TestCase): """Test miscellaneous functions.""" + def test_csv_to_pkl_conversion(self): + """Test that csv filename to pkl filename works as expected.""" + tmpdir = Path(tempfile.mkdtemp()) + equation_file = tmpdir / "equations.389479384.28378374.csv" + expected_pkl_file = tmpdir / "equations.389479384.28378374.pkl" + + # First, test inputting the paths: + test_pkl_file = _csv_filename_to_pkl_filename(equation_file) + self.assertEqual(test_pkl_file, str(expected_pkl_file)) + + # Next, test inputting the strings. + test_pkl_file = _csv_filename_to_pkl_filename(str(equation_file)) + self.assertEqual(test_pkl_file, str(expected_pkl_file)) + def test_deprecation(self): """Ensure that deprecation works as expected. From a6bed2c01177ba435c141e1cd540409c3d3e34ec Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Thu, 4 Aug 2022 18:25:42 -0400 Subject: [PATCH 24/40] Fix bug with inplace editing of equation_file_contents_ --- pysr/sr.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pysr/sr.py b/pysr/sr.py index d1a209cd6..3a5415554 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -1,3 +1,4 @@ +import copy import os import sys import numpy as np @@ -1928,7 +1929,9 @@ def get_hof(self): ret_outputs = [] - for output in self.equation_file_contents_: + equation_file_contents = copy.deepcopy(self.equation_file_contents_) + + for output in equation_file_contents: scores = [] lastMSE = None From f5577eac29e49fe913ce12a214b88cba787f2e6d Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Thu, 4 Aug 2022 18:57:30 -0400 Subject: [PATCH 25/40] Reduce precision of tests --- test/test.py | 6 +++--- test/test_jax.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test.py b/test/test.py index dd07c601f..dd1ece29f 100644 --- a/test/test.py +++ b/test/test.py @@ -140,7 +140,7 @@ def test_multioutput_weighted_with_callable_temp_equation(self): # These tests are flaky, so don't fail test: try: np.testing.assert_almost_equal( - model.predict(X.copy())[:, 0], X[:, 0] ** 2, decimal=4 + model.predict(X.copy())[:, 0], X[:, 0] ** 2, decimal=3 ) except AssertionError: print("Error in test_multioutput_weighted_with_callable_temp_equation") @@ -149,7 +149,7 @@ def test_multioutput_weighted_with_callable_temp_equation(self): try: np.testing.assert_almost_equal( - model.predict(X.copy())[:, 1], X[:, 1] ** 2, decimal=4 + model.predict(X.copy())[:, 1], X[:, 1] ** 2, decimal=3 ) except AssertionError: print("Error in test_multioutput_weighted_with_callable_temp_equation") @@ -401,7 +401,7 @@ def test_best_lambda(self): X = self.X y = self.y for f in [self.model.predict, self.equations_.iloc[-1]["lambda_format"]]: - np.testing.assert_almost_equal(f(X), y, decimal=4) + np.testing.assert_almost_equal(f(X), y, decimal=3) class TestFeatureSelection(unittest.TestCase): diff --git a/test/test_jax.py b/test/test_jax.py index 58d1a6067..e885a8d3b 100644 --- a/test/test_jax.py +++ b/test/test_jax.py @@ -76,7 +76,7 @@ def test_pipeline(self): np.testing.assert_almost_equal( np.array(jformat["callable"](jnp.array(X), jformat["parameters"])), np.square(np.cos(X[:, 1])), # Select feature 1 - decimal=4, + decimal=3, ) def test_feature_selection_custom_operators(self): From 34f4e3f83fb1f1dd691ad5b57572bf1e7673125e Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Fri, 5 Aug 2022 00:22:07 -0400 Subject: [PATCH 26/40] Change model load to classmethod --- pysr/__init__.py | 1 - pysr/sr.py | 217 ++++++++++++++++++++++++----------------------- test/test.py | 8 +- 3 files changed, 117 insertions(+), 109 deletions(-) diff --git a/pysr/__init__.py b/pysr/__init__.py index 210e85cb7..e303becb2 100644 --- a/pysr/__init__.py +++ b/pysr/__init__.py @@ -6,7 +6,6 @@ best_tex, best_callable, best_row, - load, ) from .julia_helpers import install from .feynman_problems import Problem, FeynmanProblem diff --git a/pysr/sr.py b/pysr/sr.py index 3a5415554..e98d36b67 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -810,6 +810,119 @@ def __init__( f"{k} is not a valid keyword argument for PySRRegressor." ) + @classmethod + def from_file( + cls, + equation_file, + *, + binary_operators=None, + unary_operators=None, + n_features_in=None, + feature_names_in=None, + selection_mask=None, + nout=1, + **pysr_kwargs, + ): + """ + Create a model from a saved model checkpoint or equation file. + + Parameters + ---------- + equation_file : str + Path to a pickle file containing a saved model, or a csv file + containing equations. + + binary_operators : list[str] + The same binary operators used when creating the model. + Not needed if loading from a pickle file. + + unary_operators : list[str] + The same unary operators used when creating the model. + Not needed if loading from a pickle file. + + n_features_in : int + Number of features passed to the model. + Not needed if loading from a pickle file. + + feature_names_in : list[str] + Names of the features passed to the model. + Not needed if loading from a pickle file. + + selection_mask : list[bool] + If using select_k_features, you must pass `model.selection_mask_` here. + Not needed if loading from a pickle file. + + nout : int, default=1 + Number of outputs of the model. + Not needed if loading from a pickle file. + + pysr_kwargs : dict + Any other keyword arguments to initialize the PySRRegressor object. + These will overwrite those stored in the pickle file. + Not needed if loading from a pickle file. + + Returns + ------- + model : PySRRegressor + The model with fitted equations. + """ + if os.path.splitext(equation_file)[1] != ".pkl": + pkl_filename = _csv_filename_to_pkl_filename(equation_file) + else: + pkl_filename = equation_file + + # Try to load model from .pkl + print(f"Checking if {pkl_filename} exists...") + if os.path.exists(pkl_filename): + print(f"Loading model from {pkl_filename}") + assert binary_operators is None + assert unary_operators is None + assert n_features_in is None + with open(pkl_filename, "rb") as f: + model = pkl.load(f) + # Update any parameters if necessary, such as + # extra_sympy_mappings: + model.set_params(**pysr_kwargs) + if "equations_" not in model.__dict__ or model.equations_ is None: + model.refresh() + + return model + + # Else, we re-create it. + print( + f"{equation_file} does not exist, " + "so we must create the model from scratch." + ) + assert binary_operators is not None + assert unary_operators is not None + assert n_features_in is not None + + # TODO: copy .bkup file if exists. + model = cls( + equation_file=equation_file, + binary_operators=binary_operators, + unary_operators=unary_operators, + **pysr_kwargs, + ) + + model.nout_ = nout + model.n_features_in_ = n_features_in + + if feature_names_in is None: + model.feature_names_in_ = [f"x{i}" for i in range(n_features_in)] + else: + assert len(feature_names_in) == n_features_in + model.feature_names_in_ = feature_names_in + + if selection_mask is None: + model.selection_mask_ = np.ones(n_features_in, dtype=bool) + else: + model.selection_mask_ = selection_mask + + model.refresh(checkpoint_file=equation_file) + + return model + def __repr__(self): """ Prints all current equations fitted by the model. @@ -2091,107 +2204,3 @@ def _csv_filename_to_pkl_filename(csv_filename) -> str: pkl_basename = base + ".pkl" return os.path.join(dirname, pkl_basename) - - -def load( - equation_file, - *, - binary_operators=None, - unary_operators=None, - n_features_in=None, - feature_names_in=None, - selection_mask=None, - nout=1, - **pysr_kwargs, -): - """ - Create a model from equations stored as a csv file - - Parameters - ---------- - equation_file : str - Path to a csv file containing equations, or a pickle file - containing the model. - - binary_operators : list[str], default=["+", "-", "*", "/"] - The same binary operators used when creating the model. - - unary_operators : list[str], default=None - The same unary operators used when creating the model. - - n_features_in : int - Number of features passed to the model. - - feature_names_in : list[str], default=None - Names of the features passed to the model. - - selection_mask : list[bool], default=None - If using select_k_features, you must pass `model.selection_mask_` here. - - nout : int, default=1 - Number of outputs of the model. - - pysr_kwargs : dict - Any other keyword arguments to initialize the PySRRegressor object. - These will overwrite those stored in the pickle file. - - Returns - ------- - model : PySRRegressor - The model with fitted equations. - """ - if os.path.splitext(equation_file)[1] != ".pkl": - pkl_filename = _csv_filename_to_pkl_filename(equation_file) - else: - pkl_filename = equation_file - - # Try to load model from .pkl - print(f"Checking if {pkl_filename} exists...") - if os.path.exists(pkl_filename): - print(f"Loading model from {pkl_filename}") - assert binary_operators is None - assert unary_operators is None - assert n_features_in is None - with open(pkl_filename, "rb") as f: - model = pkl.load(f) - # Update any parameters if necessary, such as - # extra_sympy_mappings: - model.set_params(**pysr_kwargs) - if "equations_" not in model.__dict__ or model.equations_ is None: - model.refresh() - - return model - - # Else, we re-create it. - print( - f"{equation_file} does not exist, " "so we must create the model from scratch." - ) - assert binary_operators is not None - assert unary_operators is not None - assert n_features_in is not None - - # TODO: copy .bkup file if exists. - model = PySRRegressor( - equation_file=equation_file, - binary_operators=binary_operators, - unary_operators=unary_operators, - **pysr_kwargs, - ) - - model.nout_ = nout - model.n_features_in_ = n_features_in - - if feature_names_in is None: - model.feature_names_in_ = [f"x{i}" for i in range(n_features_in)] - else: - assert len(feature_names_in) == n_features_in - model.feature_names_in_ = feature_names_in - - if selection_mask is None: - model.selection_mask_ = np.ones(n_features_in, dtype=bool) - else: - model.selection_mask_ = selection_mask - - model.refresh(checkpoint_file=equation_file) - - return model diff --git a/test/test.py b/test/test.py index dd1ece29f..fcde9ff9e 100644 --- a/test/test.py +++ b/test/test.py @@ -4,7 +4,7 @@ import unittest import numpy as np from sklearn import model_selection -from pysr import PySRRegressor, load +from pysr import PySRRegressor from pysr.sr import ( run_feature_selection, _handle_feature_selection, @@ -300,7 +300,7 @@ def test_load_model(self): equation_filename = str(rand_dir / "equation.csv") with open(equation_filename + (".bkup" if from_backup else ""), "w") as f: f.write(csv_file_data) - model = load( + model = PySRRegressor.from_file( equation_filename, n_features_in=5, feature_names_in=["f0", "f1", "f2", "f3", "f4"], @@ -334,7 +334,7 @@ def test_load_model_simple(self): # lambda functions are removed from the pickling, so we need # to pass it during the loading: - model2 = load( + model2 = PySRRegressor.from_file( model.equation_file_, extra_sympy_mappings={"sq": lambda x: x**2} ) @@ -346,7 +346,7 @@ def test_load_model_simple(self): os.remove(file_to_delete) pickle_file = rand_dir / "equations.pkl" - model3 = load( + model3 = PySRRegressor.from_file( model.equation_file_, extra_sympy_mappings={"sq": lambda x: x**2} ) np.testing.assert_allclose(model.predict(self.X), model3.predict(self.X)) From 07217e13f141a25acf9b50e793ac84f399a45ecc Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Fri, 5 Aug 2022 00:26:20 -0400 Subject: [PATCH 27/40] Add assertion for csv filename --- pysr/sr.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pysr/sr.py b/pysr/sr.py index e98d36b67..198867f15 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -2197,6 +2197,8 @@ def run_feature_selection(X, y, select_k_features, random_state=None): def _csv_filename_to_pkl_filename(csv_filename) -> str: # Assume that the csv filename is of the form "foo.csv" + assert csv_filename.endswith(".csv") + dirname = str(os.path.dirname(csv_filename)) basename = str(os.path.basename(csv_filename)) base = str(os.path.splitext(basename)[0]) From f5a5c8e7b1a58651532a7d8dd2d0322f949ccce0 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Fri, 5 Aug 2022 16:16:04 -0400 Subject: [PATCH 28/40] Fix assertion on csv filenames --- pysr/sr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pysr/sr.py b/pysr/sr.py index 198867f15..8956d0960 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -2197,7 +2197,7 @@ def run_feature_selection(X, y, select_k_features, random_state=None): def _csv_filename_to_pkl_filename(csv_filename) -> str: # Assume that the csv filename is of the form "foo.csv" - assert csv_filename.endswith(".csv") + assert str(csv_filename).endswith(".csv") dirname = str(os.path.dirname(csv_filename)) basename = str(os.path.basename(csv_filename)) From 9433a8315f8134697e259f5c881f5e3d15ad36bd Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Wed, 10 Aug 2022 10:30:24 +0300 Subject: [PATCH 29/40] Add README example for from_file --- README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a3630781c..fa620155d 100644 --- a/README.md +++ b/README.md @@ -162,7 +162,15 @@ This arrow in the `pick` column indicates which equation is currently selected b SymPy format (`sympy_format` - which you can also get with `model.sympy()`), and even JAX and PyTorch format (both of which are differentiable - which you can get with `model.jax()` and `model.pytorch()`). -Note that `PySRRegressor` stores the state of the last search, and will restart from where you left off the next time you call `.fit()`. This will cause problems if significant changes are made to the search parameters (like changing the operators). You can run `model.reset()` to reset the state. +Note that `PySRRegressor` stores the state of the last search, and will restart from where you left off the next time you call `.fit()`, assuming you have set `warm_start=True`. +This will cause problems if significant changes are made to the search parameters (like changing the operators). You can run `model.reset()` to reset the state. + +You will notice that PySR will save two files: `hall_of_fame...csv` and `hall_of_fame...pkl`. +The csv file is a list of equations and their losses, and the pkl file is a saved state of the model. +You may load the model from the `pkl` file with: +```python +model = PySRRegressor.from_file("hall_of_fame.2022-08-10_100832.281.pkl") +``` There are several other useful features such as denoising (e.g., `denoising=True`), feature selection (e.g., `select_k_features=3`). From 87750391b3c312f7e5398e7c4d4940ab96574e08 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Wed, 10 Aug 2022 11:17:14 +0300 Subject: [PATCH 30/40] Add notes about model loading to options page --- docs/options.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/docs/options.md b/docs/options.md index 942f7af7c..914d7490f 100644 --- a/docs/options.md +++ b/docs/options.md @@ -16,6 +16,7 @@ may find useful include: - LaTeX, SymPy - Callable exports: numpy, pytorch, jax - `loss` +- Model loading These are described below @@ -252,3 +253,20 @@ Can also uses these losses for weighted (weighted-average): model = PySRRegressor(..., weights=weights, loss="LPDistLoss{3}()") model.fit(..., weights=weights) ``` + +## Model loading + +PySR will automatically save a pickle file of the model state +when you call `model.fit`, once before the search starts, +and again after the search finishes. The filename will +have the same base name as the input file, but with a `.pkl` extension. +You can load the saved model state with: +```python +model = PySRRegressor.from_file(pickle_filename) +``` +If you have a long-running job and would like to load the model +before completion, you can also do this. In this case, the model +loading will use the `csv` file to load the equations, since the +`csv` file is continually updated during the search. Once +the search completes, the model including its equations will +be saved to the pickle file, overwriting the existing version. \ No newline at end of file From 45bf2c2f0a433b802675fb52621338f79edca7b5 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Wed, 10 Aug 2022 11:44:29 +0300 Subject: [PATCH 31/40] Assume normal csv format in reads --- pysr/sr.py | 8 ++++---- pysr/version.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pysr/sr.py b/pysr/sr.py index 34a1f5365..1a850bf1e 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -1969,12 +1969,12 @@ def _read_equation_file(self): cur_filename = str(self.equation_file_) + f".out{i}" + ".bkup" if not os.path.exists(cur_filename): cur_filename = str(self.equation_file_) + f".out{i}" - df = pd.read_csv(cur_filename, sep="|") + df = pd.read_csv(cur_filename) # Rename Complexity column to complexity: df.rename( columns={ "Complexity": "complexity", - "MSE": "loss", + "Loss": "loss", "Equation": "equation", }, inplace=True, @@ -1985,11 +1985,11 @@ def _read_equation_file(self): filename = str(self.equation_file_) + ".bkup" if not os.path.exists(filename): filename = str(self.equation_file_) - all_outputs = [pd.read_csv(filename, sep="|")] + all_outputs = [pd.read_csv(filename)] all_outputs[-1].rename( columns={ "Complexity": "complexity", - "MSE": "loss", + "Loss": "loss", "Equation": "equation", }, inplace=True, diff --git a/pysr/version.py b/pysr/version.py index 63083e35d..dfe483cc5 100644 --- a/pysr/version.py +++ b/pysr/version.py @@ -1,2 +1,2 @@ __version__ = "0.9.5" -__symbolic_regression_jl_version__ = "0.9.7" +__symbolic_regression_jl_version__ = "0.10.0" From 6bd5a10033db15ed9daebf79453607329621d0fc Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Wed, 10 Aug 2022 11:45:12 +0300 Subject: [PATCH 32/40] Update docs on expected csv file --- pysr/sr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pysr/sr.py b/pysr/sr.py index 1a850bf1e..8d535c77b 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -469,7 +469,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): Whether to use a progress bar instead of printing to stdout. equation_file : str, default=None - Where to save the files (.csv separated by |). + Where to save the files (with `.csv` extension). temp_equation_file : bool, default=False Whether to put the hall of fame file in the temp directory. From 593c6741708b84f23705cb1748f072d07c5322c0 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Wed, 10 Aug 2022 11:46:10 +0300 Subject: [PATCH 33/40] Bump version with new csv format --- pysr/version.py | 2 +- test/test.py | 10 +++++----- test/test_jax.py | 12 ++++++------ test/test_torch.py | 18 +++++++++--------- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/pysr/version.py b/pysr/version.py index dfe483cc5..9429c9aa6 100644 --- a/pysr/version.py +++ b/pysr/version.py @@ -1,2 +1,2 @@ -__version__ = "0.9.5" +__version__ = "0.10.0" __symbolic_regression_jl_version__ = "0.10.0" diff --git a/test/test.py b/test/test.py index fcde9ff9e..76fe22c87 100644 --- a/test/test.py +++ b/test/test.py @@ -288,10 +288,10 @@ def test_high_dim_selection_early_stop(self): def test_load_model(self): """See if we can load a ran model from the equation file.""" csv_file_data = """ - Complexity|MSE|Equation - 1|0.19951081|1.9762075 - 3|0.12717344|(f0 + 1.4724599) - 4|0.104823045|pow_abs(2.2683423, cos(f3))""" + Complexity,Loss,Equation + 1,0.19951081,"1.9762075" + 3,0.12717344,"(f0 + 1.4724599)" + 4,0.104823045,"pow_abs(2.2683423, cos(f3))\"""" # Strip the indents: csv_file_data = "\n".join([l.strip() for l in csv_file_data.split("\n")]) @@ -379,7 +379,7 @@ def setUp(self): self.model.selection_mask_ = None self.model.feature_names_in_ = np.array(["x0", "x1"], dtype=object) equations["complexity loss equation".split(" ")].to_csv( - "equation_file.csv.bkup", sep="|" + "equation_file.csv.bkup" ) self.model.refresh() diff --git a/test/test_jax.py b/test/test_jax.py index e885a8d3b..eb649d4fd 100644 --- a/test/test_jax.py +++ b/test/test_jax.py @@ -34,13 +34,13 @@ def test_pipeline_pandas(self): equations = pd.DataFrame( { "Equation": ["1.0", "cos(x1)", "square(cos(x1))"], - "MSE": [1.0, 0.1, 1e-5], + "Loss": [1.0, 0.1, 1e-5], "Complexity": [1, 2, 3], } ) - equations["Complexity MSE Equation".split(" ")].to_csv( - "equation_file.csv.bkup", sep="|" + equations["Complexity Loss Equation".split(" ")].to_csv( + "equation_file.csv.bkup" ) model.refresh(checkpoint_file="equation_file.csv") @@ -61,13 +61,13 @@ def test_pipeline(self): equations = pd.DataFrame( { "Equation": ["1.0", "cos(x1)", "square(cos(x1))"], - "MSE": [1.0, 0.1, 1e-5], + "Loss": [1.0, 0.1, 1e-5], "Complexity": [1, 2, 3], } ) - equations["Complexity MSE Equation".split(" ")].to_csv( - "equation_file.csv.bkup", sep="|" + equations["Complexity Loss Equation".split(" ")].to_csv( + "equation_file.csv.bkup" ) model.refresh(checkpoint_file="equation_file.csv") diff --git a/test/test_torch.py b/test/test_torch.py index 66fd2757f..c58dc1c96 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -49,13 +49,13 @@ def test_pipeline_pandas(self): equations = pd.DataFrame( { "Equation": ["1.0", "cos(x1)", "square(cos(x1))"], - "MSE": [1.0, 0.1, 1e-5], + "Loss": [1.0, 0.1, 1e-5], "Complexity": [1, 2, 3], } ) - equations["Complexity MSE Equation".split(" ")].to_csv( - "equation_file.csv.bkup", sep="|" + equations["Complexity Loss Equation".split(" ")].to_csv( + "equation_file.csv.bkup" ) model.refresh(checkpoint_file="equation_file.csv") @@ -82,13 +82,13 @@ def test_pipeline(self): equations = pd.DataFrame( { "Equation": ["1.0", "cos(x1)", "square(cos(x1))"], - "MSE": [1.0, 0.1, 1e-5], + "Loss": [1.0, 0.1, 1e-5], "Complexity": [1, 2, 3], } ) - equations["Complexity MSE Equation".split(" ")].to_csv( - "equation_file.csv.bkup", sep="|" + equations["Complexity Loss Equation".split(" ")].to_csv( + "equation_file.csv.bkup" ) model.refresh(checkpoint_file="equation_file.csv") @@ -133,13 +133,13 @@ def test_custom_operator(self): equations = pd.DataFrame( { "Equation": ["1.0", "mycustomoperator(x1)"], - "MSE": [1.0, 0.1], + "Loss": [1.0, 0.1], "Complexity": [1, 2], } ) - equations["Complexity MSE Equation".split(" ")].to_csv( - "equation_file_custom_operator.csv.bkup", sep="|" + equations["Complexity Loss Equation".split(" ")].to_csv( + "equation_file_custom_operator.csv.bkup" ) model.set_params( From 9351408c08cef2751fa632e6d334dd7cd09f0d6f Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Wed, 10 Aug 2022 12:36:25 +0300 Subject: [PATCH 34/40] Change "best" model_selection to apply loss threshold --- pysr/sr.py | 63 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/pysr/sr.py b/pysr/sr.py index 8d535c77b..bc6cb6d0e 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -205,10 +205,16 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): Parameters ---------- model_selection : str, default="best" - Model selection criterion. Can be 'accuracy' or 'best'. - `"accuracy"` selects the candidate model with the lowest loss - (highest accuracy). `"best"` selects the candidate model with - the lowest sum of normalized loss and complexity. + Model selection criterion. Can be 'accuracy', 'best', or 'score'. + - `"accuracy"` selects the candidate model with the lowest loss + (highest accuracy). + - `"score"` selects the candidate model with the highest score. + Score is defined as the derivative of the log-loss with + respect to complexity - if an expression has a much better + oss at a slightly higher complexity, it is preferred. + - `"best"` selects the candidate model with the highest score + among expressions with a loss better than at least 1.5x the + most accurate model. binary_operators : list[str], default=["+", "-", "*", "/"] List of strings giving the binary operators in Julia's Base. @@ -469,7 +475,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): Whether to use a progress bar instead of printing to stdout. equation_file : str, default=None - Where to save the files (with `.csv` extension). + Where to save the files (.csv extension). temp_equation_file : bool, default=False Whether to put the hall of fame file in the temp directory. @@ -943,12 +949,7 @@ def __repr__(self): for i, equations in enumerate(all_equations): selected = ["" for _ in range(len(equations))] - if self.model_selection == "accuracy": - chosen_row = -1 - elif self.model_selection == "best": - chosen_row = equations["score"].idxmax() - else: - raise NotImplementedError + chosen_row = idx_model_selection(equations, self.model_selection) selected[chosen_row] = ">>>>" repr_equations = pd.DataFrame( dict( @@ -1091,18 +1092,14 @@ def get_best(self, index=None): return [eq.iloc[i] for eq, i in zip(self.equations_, index)] return self.equations_.iloc[index] - if self.model_selection == "accuracy": - if isinstance(self.equations_, list): - return [eq.iloc[-1] for eq in self.equations_] - return self.equations_.iloc[-1] - elif self.model_selection == "best": - if isinstance(self.equations_, list): - return [eq.iloc[eq["score"].idxmax()] for eq in self.equations_] - return self.equations_.iloc[self.equations_["score"].idxmax()] - else: - raise NotImplementedError( - f"{self.model_selection} is not a valid model selection strategy." - ) + if isinstance(self.equations_, list): + return [ + eq.iloc[idx_model_selection(eq, self.model_selection)] + for eq in self.equations_ + ] + return self.equations_.iloc[ + idx_model_selection(self.equations_, self.model_selection) + ] def _setup_equation_file(self): """ @@ -2149,6 +2146,26 @@ def get_hof(self): return ret_outputs[0] +def idx_model_selection(equations: pd.DataFrame, model_selection: str) -> int: + """ + Return the index of the selected expression, given a dataframe of + equations and a model selection. + """ + if model_selection == "accuracy": + chosen_idx = equations["loss"].idxmin() + elif model_selection == "best": + threshold = 1.5 * equations["loss"].min() + filtered_equations = equations.query(f"loss < {threshold}") + chosen_idx = filtered_equations["score"].idxmax() + elif model_selection == "score": + chosen_idx = equations["score"].idxmax() + else: + raise NotImplementedError( + f"{model_selection} is not a valid model selection strategy." + ) + return chosen_idx + + def _denoise(X, y, Xresampled=None, random_state=None): """Denoise the dataset using a Gaussian process""" from sklearn.gaussian_process import GaussianProcessRegressor From a15823e310bb5be868748a069eb77de626401b65 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Wed, 10 Aug 2022 12:38:21 +0300 Subject: [PATCH 35/40] Reduce precision of JAX tests --- test/test_jax.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_jax.py b/test/test_jax.py index eb649d4fd..6dccd5fe7 100644 --- a/test/test_jax.py +++ b/test/test_jax.py @@ -49,7 +49,7 @@ def test_pipeline_pandas(self): np.testing.assert_almost_equal( np.array(jformat["callable"](jnp.array(X), jformat["parameters"])), np.square(np.cos(X.values[:, 1])), # Select feature 1 - decimal=4, + decimal=3, ) def test_pipeline(self): @@ -110,5 +110,5 @@ def test_feature_selection_custom_operators(self): np_output = np_prediction(X.values) jax_output = jax_prediction(X.values) - np.testing.assert_almost_equal(y.values, np_output, decimal=4) - np.testing.assert_almost_equal(y.values, jax_output, decimal=4) + np.testing.assert_almost_equal(y.values, np_output, decimal=3) + np.testing.assert_almost_equal(y.values, jax_output, decimal=3) From 74c500587cf31b51501c9692e987b38181e0335a Mon Sep 17 00:00:00 2001 From: "deepsource-autofix[bot]" <62050782+deepsource-autofix[bot]@users.noreply.github.com> Date: Wed, 10 Aug 2022 09:41:50 +0000 Subject: [PATCH 36/40] Format code with black --- pysr/sr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pysr/sr.py b/pysr/sr.py index bc6cb6d0e..2572953b2 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -210,7 +210,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): (highest accuracy). - `"score"` selects the candidate model with the highest score. Score is defined as the derivative of the log-loss with - respect to complexity - if an expression has a much better + respect to complexity - if an expression has a much better oss at a slightly higher complexity, it is preferred. - `"best"` selects the candidate model with the highest score among expressions with a loss better than at least 1.5x the From 8575fba66aad9ff98fedad3fcab11dcf33539e5a Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Wed, 10 Aug 2022 12:46:53 +0300 Subject: [PATCH 37/40] Improve docstring --- pysr/sr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pysr/sr.py b/pysr/sr.py index bc6cb6d0e..27c285ed2 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -205,7 +205,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): Parameters ---------- model_selection : str, default="best" - Model selection criterion. Can be 'accuracy', 'best', or 'score'. + Model selection criterion when selecting a final expression from + the list of best expression at each complexity. + Can be 'accuracy', 'best', or 'score'. - `"accuracy"` selects the candidate model with the lowest loss (highest accuracy). - `"score"` selects the candidate model with the highest score. From ed5b70a1ee515f53dcfd0b5e147338a6e4712366 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Wed, 10 Aug 2022 14:08:02 +0300 Subject: [PATCH 38/40] Fix model selection for loss=0 --- pysr/sr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pysr/sr.py b/pysr/sr.py index fdba4ce76..3f71b60bb 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -2157,7 +2157,7 @@ def idx_model_selection(equations: pd.DataFrame, model_selection: str) -> int: chosen_idx = equations["loss"].idxmin() elif model_selection == "best": threshold = 1.5 * equations["loss"].min() - filtered_equations = equations.query(f"loss < {threshold}") + filtered_equations = equations.query(f"loss <= {threshold}") chosen_idx = filtered_equations["score"].idxmax() elif model_selection == "score": chosen_idx = equations["score"].idxmax() From 175b0245886720b58227fb4710cc90c17ddb944d Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Wed, 10 Aug 2022 14:23:00 +0300 Subject: [PATCH 39/40] Add unit-test for all selection strategies --- test/test.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/test/test.py b/test/test.py index 76fe22c87..f50013cd3 100644 --- a/test/test.py +++ b/test/test.py @@ -9,6 +9,7 @@ run_feature_selection, _handle_feature_selection, _csv_filename_to_pkl_filename, + idx_model_selection, ) from sklearn.utils.estimator_checks import check_estimator import sympy @@ -403,6 +404,20 @@ def test_best_lambda(self): for f in [self.model.predict, self.equations_.iloc[-1]["lambda_format"]]: np.testing.assert_almost_equal(f(X), y, decimal=3) + def test_all_selection_strategies(self): + equations = pd.DataFrame( + dict( + loss=[1.0, 0.1, 0.01, 0.001 * 1.4, 0.001], + score=[0.5, 1.0, 0.5, 0.5, 0.3], + ) + ) + idx_accuracy = idx_model_selection(equations, "accuracy") + self.assertEqual(idx_accuracy, 4) + idx_best = idx_model_selection(equations, "best") + self.assertEqual(idx_best, 3) + idx_score = idx_model_selection(equations, "score") + self.assertEqual(idx_score, 1) + class TestFeatureSelection(unittest.TestCase): def setUp(self): From 73d0a989bb10ece021a625cfbf706019d44b39ce Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Wed, 10 Aug 2022 14:55:53 +0300 Subject: [PATCH 40/40] Clean up docstring --- pysr/sr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pysr/sr.py b/pysr/sr.py index 3f71b60bb..6b8e2c613 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -211,9 +211,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): - `"accuracy"` selects the candidate model with the lowest loss (highest accuracy). - `"score"` selects the candidate model with the highest score. - Score is defined as the derivative of the log-loss with + Score is defined as the negated derivative of the log-loss with respect to complexity - if an expression has a much better - oss at a slightly higher complexity, it is preferred. + loss at a slightly higher complexity, it is preferred. - `"best"` selects the candidate model with the highest score among expressions with a loss better than at least 1.5x the most accurate model.