Commit aa4f9c2

FIX: fix oversampling bug

benleetownsend committed Sep 6, 2018
1 parent c8443ba commit aa4f9c2
Showing 10 changed files with 867 additions and 134 deletions.
40 changes: 34 additions & 6 deletions enso/experiment/__init__.py
@@ -20,7 +20,7 @@
 from enso.sample import sample
 from enso.utils import feature_set_location, BaseObject
 from enso.mode import ModeKeys
-from enso.config import FEATURIZERS, DATA, EXPERIMENTS, METRICS, TEST_SETUP, RESULTS_DIRECTORY, N_GPUS, N_CORES, MODE
+from enso.config import FEATURIZERS, DATA, EXPERIMENTS, METRICS, TEST_SETUP, RESULTS_DIRECTORY, N_GPUS, N_CORES, MODE, EXPERIMENT_NAME
 from enso.registry import Registry, ValidateExperiments
 from multiprocessing import Process

@@ -72,14 +72,30 @@ def run_experiments(self):
             except Exception:
                 logging.exception("Exception occurred for {}".format(current_setting))

+    def experiment_has_been_run(self, current_settings):
+        result_path = os.path.join(RESULTS_DIRECTORY, EXPERIMENT_NAME, "Results.csv")
+        if not os.path.exists(result_path):
+            return False
+        results = pd.read_csv(result_path)
+        indexes = results["Experiment"] == current_settings["Experiment"]
+        for col, val in current_settings.items():
+            indexes = indexes & (results[col] == val)
+        experiments = results.loc[indexes]
+        if len(experiments) < len(METRICS) * TEST_SETUP["n_splits"]:
+            return False
+        return True
+
     def _run_sub_experiment(self, experiment_cls, dataset, train, test, target, current_setting):
-        experiment = experiment_cls()
+        experiment = experiment_cls(Registry.get_resampler(current_setting["Resampler"]))

         name = experiment.name()
         internal_setting = {
             'Experiment': name
         }
         internal_setting.update(current_setting)
+        if self.experiment_has_been_run(internal_setting):
+            logging.info("Experiment has been run, skipping...")
+            return
         logging.info("Training with settings {}".format(internal_setting))
         try:
             # You might find yourself wondering why we're using lists here instead of np arrays
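The new experiment_has_been_run check above treats a configuration as complete only when Results.csv already holds one row per metric per split, i.e. at least len(METRICS) * TEST_SETUP["n_splits"] rows matching every key in current_settings; note it also implicitly requires each settings key to exist as a column in the CSV. A minimal standalone sketch of the same idea (the path, column names, and counts here are illustrative placeholders, not enso's actual configuration):

import os

import pandas as pd


def has_been_run(result_path, settings, expected_rows):
    """Return True iff `settings` already has all of its expected result rows."""
    if not os.path.exists(result_path):
        return False
    results = pd.read_csv(result_path)
    mask = pd.Series(True, index=results.index)
    for col, val in settings.items():
        mask &= results[col] == val
    return int(mask.sum()) >= expected_rows


# e.g. 3 metrics x 5 cross-validation splits = 15 expected rows
skip = has_been_run("Results.csv", {"Experiment": "MyExperiment"}, expected_rows=15)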
@@ -88,9 +104,7 @@ def _run_sub_experiment(self, experiment_cls, dataset, train, test, target, current_setting):
             train_labels = list(dataset[target].iloc[train])
             test_set = list(dataset['Features'].iloc[test])
             test_labels = list(dataset[target].iloc[test])
-            resampler = Registry.get_resampler(current_setting["Resampler"])
-            experiment.fit(*resampler.resample(train_set, train_labels))
-
+            experiment.fit(train_set, train_labels)
             test_pred = experiment.predict(test_set, subset='TEST')
             train_pred = experiment.predict(train_set, subset='TRAIN')
             result = self._measure_experiment(
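This hunk is the visible core of the fix: the runner no longer builds a resampler inline and fits on its output; instead the resampler is handed to the experiment at construction (see the hunk above) and fit is called on the raw training lists, with resampling happening inside the experiment. A hypothetical resampler matching the interface the code assumes, namely an object with a resample(X, y) method returning new (X, y) lists, might look like this sketch (illustrative only, not enso's implementation):

import random


class NaiveOversampler:
    # Hypothetical example: duplicate minority-class examples at random
    # until every class matches the majority class count.

    def resample(self, X, y):
        by_class = {}
        for features, label in zip(X, y):
            by_class.setdefault(label, []).append(features)
        majority = max(len(examples) for examples in by_class.values())
        X_out, y_out = [], []
        for label, examples in by_class.items():
            extra = random.choices(examples, k=majority - len(examples))
            X_out.extend(examples + extra)
            y_out.extend([label] * (len(examples) + len(extra)))
        return X_out, y_out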
@@ -239,11 +253,25 @@ class Experiment(BaseObject):

     __metaclass__ = VerifyOutput

-    def __init__(self, *args, **kwargs):
+    def __init__(self, resampler, auto_resample=True, *args, **kwargs):
         """
         Instantiate a new experiment
         """
         super().__init__(*args, **kwargs)
+        self.resampler_ = resampler
+        self.auto_resample_ = auto_resample
+
+    def resample(self, X, y):
+        return self.resampler_.resample(X, y)
+
+    def __getattr__(self, item):
+        if item == "fit" and self.auto_resample_:
+            def fit(X, y):
+                X_, y_ = self.resample(X, y)
+                return fit(X_, y_)
+            return fit
+        else:
+            return super().__getattr__(item)

     @abc.abstractmethod
     def fit(self, X, y):
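Two caveats about the __getattr__ hook above are worth flagging. First, __getattr__ is only invoked when normal attribute lookup fails, so on subclasses that define fit (as the abstract interface requires) the hook is never consulted. Second, inside the closure the name fit refers to the closure itself, so return fit(X_, y_) would recurse rather than delegate to the real training method. A sketch of one way to get the intended resample-then-fit behavior without attribute interception, assuming subclasses implement a hypothetical _fit hook instead:

import abc


class ResamplingExperiment(abc.ABC):
    # Sketch only: a template-method variant of the Experiment base class.

    def __init__(self, resampler, auto_resample=True):
        self.resampler_ = resampler
        self.auto_resample_ = auto_resample

    def fit(self, X, y):
        # Resample once on the way in, then delegate to the subclass.
        if self.auto_resample_:
            X, y = self.resampler_.resample(X, y)
        return self._fit(X, y)

    @abc.abstractmethod
    def _fit(self, X, y):
        """Subclasses implement the actual training logic here."""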
