Skip to content

Commit

Permalink
* Test suite run time reduced from 30-45 seconds to 10-12 seconds
Browse files Browse the repository at this point in the history
* A handful of test optimizations, mostly shrinking the hyperparameter space to speed up the random forest tuning tests
* Feature profiler tests previously took ~2 seconds and now take ~70 ms
  • Loading branch information
Taylor Miller committed Nov 13, 2017
1 parent 9f2f533 commit febbc67
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 69 deletions.
3 changes: 0 additions & 3 deletions healthcareai/common/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,6 @@ def extract_estimator_from_meta_estimator(model):
Returns:
sklearn.base.BaseEstimator:
"""
# Shufang commented, kerasclassifier is not a baseestimator
# if not issubclass(type(model), sklearn.base.BaseEstimator):
# raise HealthcareAIError('This requires an instance of sklearn.base.BaseEstimator')

if issubclass(type(model), sklearn.base.MetaEstimatorMixin) and hasattr(model, 'best_estimator_'):
result = model.best_estimator_
Expand Down
14 changes: 11 additions & 3 deletions healthcareai/tests/test_advanced_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@

from healthcareai import AdvancedSupervisedModelTrainer
import healthcareai.tests.helpers as test_helpers
from healthcareai.common.helpers import count_unique_elements_in_column
from healthcareai.common.helpers import count_unique_elements_in_column, \
calculate_random_forest_mtry_hyperparameter
from healthcareai.common.healthcareai_error import HealthcareAIError
import healthcareai.pipelines.data_preparation as pipelines

Expand Down Expand Up @@ -130,13 +131,20 @@ def setUp(self):
self.trainer.train_test_split(random_seed=0)

def test_random_forest_no_tuning(self):
rf = self.trainer.random_forest_classifier(trees=200, randomized_search=False)
rf = self.trainer.random_forest_classifier(trees=20, randomized_search=False)
self.assertIsInstance(rf, TrainedSupervisedModel)
test_helpers.assertBetween(self, 0.8, 0.97, rf.metrics['roc_auc'])
test_helpers.assertBetween(self, 160, 180, rf.metrics['confusion_matrix'][0][0])

def test_random_forest_tuning(self):
rf = self.trainer.random_forest_classifier(randomized_search=True)
max_features = calculate_random_forest_mtry_hyperparameter(4, 'classification')
hyperparameter_grid = {'n_estimators': [10, 20, 30],
'max_features': max_features}
rf = self.trainer.random_forest_classifier(
randomized_search=True,
number_iteration_samples=2,
hyperparameter_grid=hyperparameter_grid
)
self.assertIsInstance(rf, TrainedSupervisedModel)
test_helpers.assertBetween(self, 0.7, 0.97, rf.metrics['roc_auc'])
test_helpers.assertBetween(self, 160, 180, rf.metrics['confusion_matrix'][0][0])
Expand Down
14 changes: 11 additions & 3 deletions healthcareai/tests/test_advanced_trainer_multiclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@

from healthcareai import AdvancedSupervisedModelTrainer
import healthcareai.tests.helpers as test_helpers
from healthcareai.common.helpers import count_unique_elements_in_column
from healthcareai.common.helpers import count_unique_elements_in_column, \
calculate_random_forest_mtry_hyperparameter
from healthcareai.common.healthcareai_error import HealthcareAIError
import healthcareai.pipelines.data_preparation as pipelines

Expand Down Expand Up @@ -69,13 +70,20 @@ def setUp(self):
self.trainer.train_test_split(random_seed=0)

def test_random_forest_no_tuning(self):
rf = self.trainer.random_forest_classifier(trees=200, randomized_search=False)
rf = self.trainer.random_forest_classifier(trees=100, randomized_search=False)
self.assertIsInstance(rf, TrainedSupervisedModel)
self.assertRaises(HealthcareAIError, rf.roc_plot)
test_helpers.assertBetween(self, 15, 30, rf.metrics['confusion_matrix'][0][0])

def test_random_forest_tuning(self):
rf = self.trainer.random_forest_classifier(randomized_search=True)
max_features = calculate_random_forest_mtry_hyperparameter(4,
'classification')
hyperparameter_grid = {'n_estimators': [10, 20, 30],
'max_features': max_features}
rf = self.trainer.random_forest_classifier(
randomized_search=True,
number_iteration_samples=2,
hyperparameter_grid=hyperparameter_grid)
self.assertIsInstance(rf, TrainedSupervisedModel)
self.assertRaises(HealthcareAIError, rf.roc_plot)
test_helpers.assertBetween(self, 15, 30, rf.metrics['confusion_matrix'][0][0])
Expand Down
111 changes: 51 additions & 60 deletions healthcareai/tests/test_feature_availability_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,31 @@


class TestFeatureAvailabilityProfiler(unittest.TestCase):
def setUp(self):
self.df = pd.DataFrame(np.random.randn(1000, 4),
columns=['A', 'B', 'AdmitDTS', 'LastLoadDTS'])
def test_profiler(self):
df = pd.DataFrame(np.random.randn(1000, 4),
columns=['A', 'B', 'AdmitDTS', 'LastLoadDTS'])

# generate load date
self.df['LastLoadDTS'] = pd.datetime(2015, 5, 20)
# generate datetime objects for admit date
admit = pd.Series(1000)
delta = pd.datetime(2015, 5, 20) - pd.datetime(2015, 5, 1)
int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
df['LastLoadDTS'] = pd.datetime(2015, 5, 20)

# initialize a zero-filled series of length 1000 (placeholders overwritten below)
admit = pd.Series([0 for _ in range(1000)])

# (2015, 5, 20) - (2015, 5, 1)
delta = timedelta(days=19)
for i in range(1000):
random_second = randrange(int_delta)
admit[i] = pd.datetime(2015, 5, 1) + timedelta(seconds=random_second)
self.df['AdmitDTS'] = admit.astype('datetime64[ns]')
random_sec = randrange(delta.total_seconds())
admit[i] = pd.datetime(2015, 5, 1) + timedelta(seconds=random_sec)

df['AdmitDTS'] = admit.astype('datetime64[ns]')

# add nulls
a = np.random.rand(1000) > .5
self.df.loc[a, ['A']] = np.nan
df.loc[a, ['A']] = np.nan
a = np.random.rand(1000) > .75
self.df.loc[a, ['B']] = np.nan
df.loc[a, ['B']] = np.nan

def runTest(self):
df_out = feature_availability_profiler(data_frame=self.df,
df_out = feature_availability_profiler(data_frame=df,
admit_col_name='AdmitDTS',
last_load_col_name='LastLoadDTS',
plot_flag=False,
Expand All @@ -37,71 +41,58 @@ def runTest(self):
self.assertTrue(65 < df_out.iloc[-1, 1] < 85)
self.assertTrue(40 < df_out.iloc[-1, 0] < 60)

def tearDown(self):
del self.df


class TestFeatureAvailabilityProfilerError1(unittest.TestCase):
def setUp(self):
self.df = pd.DataFrame(np.random.randn(1000, 4),
def test_error_1(self):
df = pd.DataFrame(np.random.randn(1000, 4),
columns=['A', 'B', 'AdmitDTS',
'LastLoadDTS'])

def runTest(self):
with self.assertRaises(HealthcareAIError) as error:
dfOut = feature_availability_profiler(data_frame=self.df,
admit_col_name='AdmitDTS',
last_load_col_name='LastLoadDTS',
plot_flag=False,
list_flag=False)
self.assertEqual('Admit Date column is not a date type.', error.exception.message)
feature_availability_profiler(data_frame=df,
admit_col_name='AdmitDTS',
last_load_col_name='LastLoadDTS',
plot_flag=False,
list_flag=False)
self.assertEqual(
'Admit Date column is not a date type.',
error.exception.message)


class TestFeatureAvailabilityProfilerError2(unittest.TestCase):
def setUp(self):
self.df = pd.DataFrame(np.random.randn(1000, 4),
columns=['A', 'B', 'AdmitDTS',
'LastLoadDTS'])
def test_error_2(self):
df = pd.DataFrame(np.random.randn(1000, 4),
columns=['A', 'B', 'AdmitDTS',
'LastLoadDTS'])

self.df['AdmitDTS'] = pd.datetime(2015, 5, 20)
df['AdmitDTS'] = pd.datetime(2015, 5, 20)

def runTest(self):
with self.assertRaises(HealthcareAIError) as error:
df_out = feature_availability_profiler(data_frame=self.df,
admit_col_name='AdmitDTS',
last_load_col_name='LastLoadDTS',
plot_flag=False,
list_flag=False)
feature_availability_profiler(data_frame=df,
admit_col_name='AdmitDTS',
last_load_col_name='LastLoadDTS',
plot_flag=False,
list_flag=False)
self.assertEqual('Last Load Date column is not a date type.',
error.exception.message)


class TestFeatureAvailabilityProfilerError3(unittest.TestCase):
def setUp(self):
self.df = pd.DataFrame(np.random.randn(1000, 2),
columns=['AdmitDTS',
'LastLoadDTS'])
def test_for_error_3(self):
df = pd.DataFrame(np.random.randn(1000, 2),
columns=['AdmitDTS',
'LastLoadDTS'])
# generate load date
self.df['LastLoadDTS'] = pd.datetime(2015, 5, 20)
df['LastLoadDTS'] = pd.datetime(2015, 5, 20)

# generate datetime objects for admit date
admit = pd.Series(1000)
delta = pd.datetime(2015, 5, 20) - pd.datetime(2015, 5, 1)
int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
for i in range(1000):
random_second = randrange(int_delta)
admit[i] = pd.datetime(2015, 5, 1) + timedelta(
seconds=random_second)
self.df['AdmitDTS'] = admit.astype('datetime64[ns]')
df['AdmitDTS'] = pd.Series(1000).astype('datetime64[ns]')

def runTest(self):
with self.assertRaises(HealthcareAIError) as error:
df_out = feature_availability_profiler(data_frame=self.df,
admit_col_name='AdmitDTS',
last_load_col_name='LastLoadDTS',
plot_flag=False,
list_flag=False)
feature_availability_profiler(data_frame=df,
admit_col_name='AdmitDTS',
last_load_col_name='LastLoadDTS',
plot_flag=False,
list_flag=False)
self.assertEqual('Dataframe must be at least 3 columns.',
error.exception.message)

def tearDown(self):
del self.df
1 change: 1 addition & 0 deletions healthcareai/tests/test_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ def test_random_forest_regression(self):

result = trained_rf_regressor.metrics

self.assertIsInstance(trained_rf_regressor, TrainedSupervisedModel)
helpers.assertBetween(self, 350, 750, result['mean_squared_error'])
helpers.assertBetween(self, 10, 25, result['mean_absolute_error'])

Expand Down

0 comments on commit febbc67

Please sign in to comment.