diff --git a/pyproject.toml b/pyproject.toml
index f2c465e1a..72cb167fc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,4 +34,5 @@ profile = "black"
 dev-dependencies = [
     "pre-commit>=3.7.0",
     "ipython>=8.23.0",
+    "mypy>=1.10.0",
 ]
diff --git a/pysr/export_sympy.py b/pysr/export_sympy.py
index eeb504719..314a2cbd2 100644
--- a/pysr/export_sympy.py
+++ b/pysr/export_sympy.py
@@ -5,6 +5,8 @@
 import sympy
 from sympy import sympify
 
+from .utils import ArrayLike
+
 sympy_mappings = {
     "div": lambda x, y: x / y,
     "mult": lambda x, y: x * y,
@@ -30,8 +32,8 @@
     "acosh": lambda x: sympy.acosh(x),
     "acosh_abs": lambda x: sympy.acosh(abs(x) + 1),
     "asinh": sympy.asinh,
-    "atanh": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - 1),
-    "atanh_clip": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - 1),
+    "atanh": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - sympy.S(1)),
+    "atanh_clip": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - sympy.S(1)),
     "abs": abs,
     "mod": sympy.Mod,
     "erf": sympy.erf,
@@ -60,13 +62,13 @@
 
 
 def create_sympy_symbols_map(
-    feature_names_in: List[str],
+    feature_names_in: ArrayLike[str],
 ) -> Dict[str, sympy.Symbol]:
     return {variable: sympy.Symbol(variable) for variable in feature_names_in}
 
 
 def create_sympy_symbols(
-    feature_names_in: List[str],
+    feature_names_in: ArrayLike[str],
 ) -> List[sympy.Symbol]:
     return [sympy.Symbol(variable) for variable in feature_names_in]
 
@@ -74,7 +76,7 @@ def create_sympy_symbols(
 def pysr2sympy(
     equation: str,
     *,
-    feature_names_in: Optional[List[str]] = None,
+    feature_names_in: Optional[ArrayLike[str]] = None,
     extra_sympy_mappings: Optional[Dict[str, Callable]] = None,
 ):
     if feature_names_in is None:
diff --git a/pysr/julia_import.py b/pysr/julia_import.py
index dc881bf21..50a58fa24 100644
--- a/pysr/julia_import.py
+++ b/pysr/julia_import.py
@@ -1,6 +1,7 @@
 import os
 import sys
 import warnings
+from typing import Any
 
 # Check if JuliaCall is already loaded, and if so, warn the user
 # about the relevant environment variables. If not loaded,
@@ -37,6 +38,9 @@
 
 from juliacall import Main as jl  # type: ignore
 
+jl: Any = jl  # type: ignore
+
+
 jl_version = (jl.VERSION.major, jl.VERSION.minor, jl.VERSION.patch)
 
 # Next, automatically load the juliacall extension if we're in a Jupyter notebook
diff --git a/pysr/sr.py b/pysr/sr.py
index cb3a05c66..f1a254fd4 100644
--- a/pysr/sr.py
+++ b/pysr/sr.py
@@ -21,9 +21,12 @@
 
 import numpy as np
 import pandas as pd
+from numpy import ndarray
+from numpy.typing import NDArray
 from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
 from sklearn.utils import check_array, check_consistent_length, check_random_state
-from sklearn.utils.validation import _check_feature_names_in, check_is_fitted
+from sklearn.utils.validation import _check_feature_names_in  # type: ignore
+from sklearn.utils.validation import check_is_fitted
 
 from .denoising import denoise, multi_denoise
 from .deprecated import DEPRECATED_KWARGS
@@ -44,6 +47,7 @@
 )
 from .julia_import import SymbolicRegression, jl
 from .utils import (
+    ArrayLike,
     _csv_filename_to_pkl_filename,
     _preprocess_julia_floats,
     _safe_check_feature_names_in,
@@ -603,22 +607,17 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         Units of each variable in the training dataset, `y`.
     nout_ : int
         Number of output dimensions.
-    selection_mask_ : list[int] of length `select_k_features`
-        List of indices for input features that are selected when
-        `select_k_features` is set.
+    selection_mask_ : ndarray of shape (`n_features_in_`,)
+        Mask of which features of `X` to use when `select_k_features` is set.
     tempdir_ : Path
         Path to the temporary equations directory.
-    equation_file_ : str
+    equation_file_ : Union[str, Path]
         Output equation file name produced by the julia backend.
     julia_state_stream_ : ndarray
         The serialized state for the julia SymbolicRegression.jl backend
         (after fitting), stored as an array of uint8, produced by Julia's
         Serialization.serialize function.
-    julia_state_
-        The deserialized state.
     julia_options_stream_ : ndarray
         The serialized julia options, stored as an array of uint8,
-    julia_options_
-        The deserialized julia options.
     equation_file_contents_ : list[pandas.DataFrame]
         Contents of the equation file output by the Julia backend.
     show_pickle_warnings_ : bool
@@ -665,6 +664,21 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     ```
     """
 
+    equations_: Optional[Union[pd.DataFrame, List[pd.DataFrame]]]
+    n_features_in_: int
+    feature_names_in_: ArrayLike[str]
+    display_feature_names_in_: ArrayLike[str]
+    X_units_: Optional[ArrayLike[str]]
+    y_units_: Optional[Union[str, ArrayLike[str]]]
+    nout_: int
+    selection_mask_: Optional[NDArray[np.bool_]]
+    tempdir_: Path
+    equation_file_: Union[str, Path]
+    julia_state_stream_: Optional[NDArray[np.uint8]]
+    julia_options_stream_: Optional[NDArray[np.uint8]]
+    equation_file_contents_: Optional[List[pd.DataFrame]]
+    show_pickle_warnings_: bool
+
     def __init__(
         self,
         model_selection: Literal["best", "accuracy", "score"] = "best",
@@ -926,7 +940,7 @@ def from_file(
             Names of the features passed to the model.
             Not needed if loading from a pickle file.
         selection_mask : list[bool]
-            If using select_k_features, you must pass `model.selection_mask_` here.
+            If using `select_k_features`, you must pass `model.selection_mask_` here.
             Not needed if loading from a pickle file.
         nout : int
             Number of outputs of the model.
@@ -1024,7 +1038,7 @@ def __repr__(self):
             all_equations = equations
 
         for i, equations in enumerate(all_equations):
-            selected = ["" for _ in range(len(equations))]
+            selected = pd.Series([""] * len(equations), index=equations.index)
            chosen_row = idx_model_selection(equations, self.model_selection)
             selected[chosen_row] = ">>>>"
             repr_equations = pd.DataFrame(
@@ -1124,10 +1138,12 @@ def equations(self):  # pragma: no cover
 
     @property
     def julia_options_(self):
+        """The deserialized julia options."""
         return jl_deserialize(self.julia_options_stream_)
 
     @property
     def julia_state_(self):
+        """The deserialized state."""
         return jl_deserialize(self.julia_state_stream_)
 
     @property
@@ -1140,7 +1156,7 @@ def raw_julia_state_(self):
         )
         return self.julia_state_
 
-    def get_best(self, index=None):
+    def get_best(self, index=None) -> Union[pd.Series, List[pd.Series]]:
         """
         Get best equation using `model_selection`.
 
@@ -1163,8 +1179,6 @@
             Raised when an invalid model selection strategy is provided.
         """
         check_is_fitted(self, attributes=["equations_"])
-        if self.equations_ is None:
-            raise ValueError("No equations have been generated yet.")
 
         if index is not None:
             if isinstance(self.equations_, list):
@@ -1171,17 +1185,23 @@
                 assert isinstance(
                     index, list
                 ), "With multiple output features, index must be a list."
                 return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
-            return self.equations_.iloc[index]
+            elif isinstance(self.equations_, pd.DataFrame):
+                return self.equations_.iloc[index]
+            else:
+                raise ValueError("No equations have been generated yet.")
 
         if isinstance(self.equations_, list):
             return [
-                eq.iloc[idx_model_selection(eq, self.model_selection)]
+                eq.loc[idx_model_selection(eq, self.model_selection)]
                 for eq in self.equations_
             ]
-        return self.equations_.iloc[
-            idx_model_selection(self.equations_, self.model_selection)
-        ]
+        elif isinstance(self.equations_, pd.DataFrame):
+            return self.equations_.loc[
+                idx_model_selection(self.equations_, self.model_selection)
+            ]
+        else:
+            raise ValueError("No equations have been generated yet.")
 
     def _setup_equation_file(self):
         """
@@ -1316,7 +1336,15 @@ def _validate_and_set_init_params(self):
 
     def _validate_and_set_fit_params(
         self, X, y, Xresampled, weights, variable_names, X_units, y_units
-    ):
+    ) -> Tuple[
+        ndarray,
+        ndarray,
+        Optional[ndarray],
+        Optional[ndarray],
+        ndarray,
+        Optional[ArrayLike[str]],
+        Optional[Union[str, ArrayLike[str]]],
+    ]:
         """
         Validate the parameters passed to the :term`fit` method.
 
@@ -1336,7 +1364,7 @@
             Weight array of the same shape as `y`.
             Each element is how to weight the mean-square-error
             loss for that particular element of y.
-        variable_names : list[str] of length n_features
+        variable_names : ndarray of length n_features
             Names of each variable in the training dataset, `X`.
         X_units : list[str] of length n_features
             Units of each variable in the training dataset, `X`.
@@ -1392,7 +1420,7 @@
         if weights is not None:
             weights = check_array(weights, ensure_2d=False)
             check_consistent_length(weights, y)
-        X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
+        X, y = self._validate_data_X_y(X, y)
         self.feature_names_in_ = _safe_check_feature_names_in(
             self, variable_names, generate_names=False
         )
@@ -1402,10 +1430,10 @@
             self.display_feature_names_in_ = np.array(
                 [f"x{_subscriptify(i)}" for i in range(X.shape[1])]
             )
+            variable_names = self.feature_names_in_
         else:
             self.display_feature_names_in_ = self.feature_names_in_
-
-        variable_names = self.feature_names_in_
+            variable_names = self.feature_names_in_
 
         # Handle multioutput data
         if len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1):
@@ -1420,6 +1448,12 @@
 
         return X, y, Xresampled, weights, variable_names, X_units, y_units
 
+    def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
+        return self._validate_data(X=X, y=y, reset=True, multi_output=True)  # type: ignore
+
+    def _validate_data_X(self, X) -> Tuple[ndarray]:
+        return self._validate_data(X=X, reset=False)  # type: ignore
+
     def _pre_transform_training_data(
         self, X, y, Xresampled, variable_names, X_units, y_units, random_state
     ):
@@ -1489,7 +1523,7 @@
         self.X_units_ = copy.deepcopy(X_units)
 
         # Re-perform data validation and feature name updating
-        X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
+        X, y = self._validate_data_X_y(X, y)
         # Update feature names with selected variable names
         self.feature_names_in_ = _check_feature_names_in(self, variable_names)
         self.display_feature_names_in_ = self.feature_names_in_
@@ -1506,7 +1540,7 @@
 
         return X, y, variable_names, X_units, y_units
 
-    def _run(self, X, y, mutated_params, weights, seed):
+    def _run(self, X, y, mutated_params, weights, seed: int):
         """
         Run the symbolic regression fitting process on the julia backend.
 
@@ -1784,9 +1818,9 @@ def fit(
         y,
         Xresampled=None,
         weights=None,
-        variable_names: Optional[List[str]] = None,
-        X_units: Optional[List[str]] = None,
-        y_units: Optional[List[str]] = None,
+        variable_names: Optional[ArrayLike[str]] = None,
+        X_units: Optional[ArrayLike[str]] = None,
+        y_units: Optional[Union[str, ArrayLike[str]]] = None,
     ) -> "PySRRegressor":
         """
         Search for equations to fit the dataset and store them in `self.equations_`.
@@ -1848,9 +1882,6 @@
         self.X_units_ = None
         self.y_units_ = None
 
-        random_state = check_random_state(self.random_state)  # For np random
-        seed = random_state.get_state()[1][0]  # For julia random
-
         self._setup_equation_file()
 
         mutated_params = self._validate_and_set_init_params()
@@ -1878,6 +1909,9 @@
                 "More datapoints will lower the search speed."
             )
 
+        random_state = check_random_state(self.random_state)  # For np random
+        seed = random_state.randint(0, 2**31 - 1)  # For julia random
+
         # Pre transformations (feature selection and denoising)
         X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
             X, y, Xresampled, variable_names, X_units, y_units, random_state
@@ -1928,7 +1962,7 @@
 
         return self
 
-    def refresh(self, checkpoint_file=None):
+    def refresh(self, checkpoint_file=None) -> None:
         """
         Update self.equations_ with any new options passed.
 
@@ -2003,14 +2037,16 @@ def predict(self, X, index=None):
         # reordered/reindexed to match those of the transformed (denoised and
         # feature selected) X in fit.
         X = X.reindex(columns=self.feature_names_in_)
-        X = self._validate_data(X, reset=False)
+        X = self._validate_data_X(X)
 
         try:
-            if self.nout_ > 1:
+            if isinstance(best_equation, list):
+                assert self.nout_ > 1
                 return np.stack(
                     [eq["lambda_format"](X) for eq in best_equation], axis=1
                 )
-            return best_equation["lambda_format"](X)
+            else:
+                return best_equation["lambda_format"](X)
         except Exception as error:
             raise ValueError(
                 "Failed to evaluate the expression. "
" @@ -2040,9 +2076,11 @@ def sympy(self, index=None): """ self.refresh() best_equation = self.get_best(index=index) - if self.nout_ > 1: + if isinstance(best_equation, list): + assert self.nout_ > 1 return [eq["sympy_format"] for eq in best_equation] - return best_equation["sympy_format"] + else: + return best_equation["sympy_format"] def latex(self, index=None, precision=3): """ @@ -2102,9 +2140,11 @@ def jax(self, index=None): self.set_params(output_jax_format=True) self.refresh() best_equation = self.get_best(index=index) - if self.nout_ > 1: + if isinstance(best_equation, list): + assert self.nout_ > 1 return [eq["jax_format"] for eq in best_equation] - return best_equation["jax_format"] + else: + return best_equation["jax_format"] def pytorch(self, index=None): """ @@ -2132,9 +2172,10 @@ def pytorch(self, index=None): self.set_params(output_torch_format=True) self.refresh() best_equation = self.get_best(index=index) - if self.nout_ > 1: + if isinstance(best_equation, list): return [eq["torch_format"] for eq in best_equation] - return best_equation["torch_format"] + else: + return best_equation["torch_format"] def _read_equation_file(self): """Read the hall of fame file created by `SymbolicRegression.jl`.""" @@ -2233,10 +2274,8 @@ def get_hof(self): lastComplexity = 0 sympy_format = [] lambda_format = [] - if self.output_jax_format: - jax_format = [] - if self.output_torch_format: - torch_format = [] + jax_format = [] + torch_format = [] for _, eqn_row in output.iterrows(): eqn = pysr2sympy( @@ -2348,7 +2387,7 @@ def latex_table( """ self.refresh() - if self.nout_ > 1: + if isinstance(self.equations_, list): if indices is not None: assert isinstance(indices, list) assert isinstance(indices[0], list) @@ -2357,7 +2396,7 @@ def latex_table( table_string = sympy2multilatextable( self.equations_, indices=indices, precision=precision, columns=columns ) - else: + elif isinstance(self.equations_, pd.DataFrame): if indices is not None: assert isinstance(indices, list) assert isinstance(indices[0], int) @@ -2365,6 +2404,11 @@ def latex_table( table_string = sympy2latextable( self.equations_, indices=indices, precision=precision, columns=columns ) + else: + raise ValueError( + "Invalid type for equations_ to pass to `latex_table`. " + "Expected a DataFrame or a list of DataFrames." + ) preamble_string = [ r"\usepackage{breqn}", diff --git a/pysr/utils.py b/pysr/utils.py index ca000aae7..91fdffc0e 100644 --- a/pysr/utils.py +++ b/pysr/utils.py @@ -1,10 +1,16 @@ import os import re +from pathlib import Path +from typing import Any, List, TypeVar, Union -from sklearn.utils.validation import _check_feature_names_in +from numpy import ndarray +from sklearn.utils.validation import _check_feature_names_in # type: ignore +T = TypeVar("T", bound=Any) +ArrayLike = Union[ndarray, List[T]] -def _csv_filename_to_pkl_filename(csv_filename: str) -> str: + +def _csv_filename_to_pkl_filename(csv_filename: Union[str, Path]) -> Union[str, Path]: if os.path.splitext(csv_filename)[1] == ".pkl": return csv_filename