Skip to content

Commit

Permalink
Merge 175b024 into a15823e
Browse files Browse the repository at this point in the history
  • Loading branch information
MilesCranmer committed Aug 10, 2022
2 parents a15823e + 175b024 commit 3c391f6
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 23 deletions.
65 changes: 42 additions & 23 deletions pysr/sr.py
Expand Up @@ -205,10 +205,18 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
Parameters
----------
model_selection : str, default="best"
Model selection criterion. Can be 'accuracy' or 'best'.
`"accuracy"` selects the candidate model with the lowest loss
(highest accuracy). `"best"` selects the candidate model with
the lowest sum of normalized loss and complexity.
Model selection criterion when selecting a final expression from
the list of best expressions at each complexity.
Can be 'accuracy', 'best', or 'score'.
- `"accuracy"` selects the candidate model with the lowest loss
(highest accuracy).
- `"score"` selects the candidate model with the highest score.
Score is defined as the derivative of the log-loss with
respect to complexity - if an expression has a much better
loss at a slightly higher complexity, it is preferred.
- `"best"` selects the candidate model with the highest score
among expressions whose loss is at most 1.5x the loss of the
most accurate model.
binary_operators : list[str], default=["+", "-", "*", "/"]
List of strings giving the binary operators in Julia's Base.
Expand Down Expand Up @@ -469,7 +477,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
Whether to use a progress bar instead of printing to stdout.
equation_file : str, default=None
Where to save the files (with `.csv` extension).
Where to save the files (.csv extension).
temp_equation_file : bool, default=False
Whether to put the hall of fame file in the temp directory.
Expand Down Expand Up @@ -943,12 +951,7 @@ def __repr__(self):

for i, equations in enumerate(all_equations):
selected = ["" for _ in range(len(equations))]
if self.model_selection == "accuracy":
chosen_row = -1
elif self.model_selection == "best":
chosen_row = equations["score"].idxmax()
else:
raise NotImplementedError
chosen_row = idx_model_selection(equations, self.model_selection)
selected[chosen_row] = ">>>>"
repr_equations = pd.DataFrame(
dict(
Expand Down Expand Up @@ -1091,18 +1094,14 @@ def get_best(self, index=None):
return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
return self.equations_.iloc[index]

if self.model_selection == "accuracy":
if isinstance(self.equations_, list):
return [eq.iloc[-1] for eq in self.equations_]
return self.equations_.iloc[-1]
elif self.model_selection == "best":
if isinstance(self.equations_, list):
return [eq.iloc[eq["score"].idxmax()] for eq in self.equations_]
return self.equations_.iloc[self.equations_["score"].idxmax()]
else:
raise NotImplementedError(
f"{self.model_selection} is not a valid model selection strategy."
)
if isinstance(self.equations_, list):
return [
eq.iloc[idx_model_selection(eq, self.model_selection)]
for eq in self.equations_
]
return self.equations_.iloc[
idx_model_selection(self.equations_, self.model_selection)
]

def _setup_equation_file(self):
"""
Expand Down Expand Up @@ -2149,6 +2148,26 @@ def get_hof(self):
return ret_outputs[0]


def idx_model_selection(equations: pd.DataFrame, model_selection: str) -> int:
    """
    Return the index of the selected expression, given a dataframe of
    equations and a model selection.

    Parameters
    ----------
    equations : pd.DataFrame
        Candidate expressions. Must contain a `loss` column, and a
        `score` column for the `"best"` and `"score"` strategies.
    model_selection : str
        Selection strategy:

        - `"accuracy"`: row with the lowest loss.
        - `"score"`: row with the highest score.
        - `"best"`: row with the highest score among rows whose loss is
          at most 1.5x the lowest loss.

    Returns
    -------
    int
        Index label of the chosen row, as returned by `idxmin`/`idxmax`.

    Raises
    ------
    NotImplementedError
        If `model_selection` is not one of the strategies above.
    """
    if model_selection == "accuracy":
        return equations["loss"].idxmin()

    if model_selection == "score":
        return equations["score"].idxmax()

    if model_selection == "best":
        losses = equations["loss"]
        min_loss = losses.min()
        # For a negative minimum loss, 1.5 * min_loss lies *below* the
        # minimum and would filter out every row. Clamp the threshold so
        # the most accurate row always qualifies; for non-negative losses
        # this is exactly 1.5 * min_loss.
        threshold = max(1.5 * min_loss, min_loss)
        # Boolean mask instead of a string-built `query`: no round trip of
        # the float through text and the expression parser.
        candidates = equations[losses <= threshold]
        return candidates["score"].idxmax()

    raise NotImplementedError(
        f"{model_selection} is not a valid model selection strategy."
    )


def _denoise(X, y, Xresampled=None, random_state=None):
"""Denoise the dataset using a Gaussian process"""
from sklearn.gaussian_process import GaussianProcessRegressor
Expand Down
15 changes: 15 additions & 0 deletions test/test.py
Expand Up @@ -9,6 +9,7 @@
run_feature_selection,
_handle_feature_selection,
_csv_filename_to_pkl_filename,
idx_model_selection,
)
from sklearn.utils.estimator_checks import check_estimator
import sympy
Expand Down Expand Up @@ -403,6 +404,20 @@ def test_best_lambda(self):
for f in [self.model.predict, self.equations_.iloc[-1]["lambda_format"]]:
np.testing.assert_almost_equal(f(X), y, decimal=3)

def test_all_selection_strategies(self):
    """Check that each model-selection strategy picks the expected row."""
    losses = [1.0, 0.1, 0.01, 0.001 * 1.4, 0.001]
    scores = [0.5, 1.0, 0.5, 0.5, 0.3]
    equations = pd.DataFrame({"loss": losses, "score": scores})

    # accuracy -> row 4 has the lowest loss (0.001).
    # best     -> rows 3 and 4 are within 1.5x of the lowest loss, and
    #             row 3 has the higher score of the two.
    # score    -> row 1 has the highest score overall (1.0).
    expected_rows = (("accuracy", 4), ("best", 3), ("score", 1))
    for strategy, expected_idx in expected_rows:
        chosen_idx = idx_model_selection(equations, strategy)
        self.assertEqual(chosen_idx, expected_idx)


class TestFeatureSelection(unittest.TestCase):
def setUp(self):
Expand Down

0 comments on commit 3c391f6

Please sign in to comment.