diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5315fb82..ab56c1be 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,8 +34,8 @@ jobs: wget --quiet https://data.nrel.gov/system/files/156/BuildStock_TMY3_FIPS.zip - name: Download and Install OpenStudio run: | - wget -q https://github.com/NREL/OpenStudio/releases/download/v3.5.1/OpenStudio-3.5.1+22e1db7be5-Ubuntu-20.04.deb - sudo apt install -y ./OpenStudio-3.5.1+22e1db7be5-Ubuntu-20.04.deb + wget -q https://github.com/NREL/OpenStudio/releases/download/v3.6.1/OpenStudio-3.6.1+bb9481519e-Ubuntu-20.04-x86_64.deb + sudo apt install -y ./OpenStudio-3.6.1+bb9481519e-Ubuntu-20.04-x86_64.deb openstudio openstudio_version which openstudio - name: Install buildstockbatch diff --git a/.gitignore b/.gitignore index d554b88a..0ccea3c4 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ coverage/ .coverage build/ .env +.history \ No newline at end of file diff --git a/buildstockbatch/aws/aws.py b/buildstockbatch/aws/aws.py index f2521d9d..c9064303 100644 --- a/buildstockbatch/aws/aws.py +++ b/buildstockbatch/aws/aws.py @@ -26,7 +26,6 @@ import logging import math import os -import pandas as pd import pathlib import random from s3fs import S3FileSystem @@ -42,7 +41,7 @@ from buildstockbatch.base import ValidationError, BuildStockBatchBase from buildstockbatch.aws.awsbase import AwsJobBase from buildstockbatch import postprocessing -from buildstockbatch.utils import ContainerRuntime, log_error_details, get_project_configuration +from buildstockbatch.utils import ContainerRuntime, log_error_details, get_project_configuration, read_csv logger = logging.getLogger(__name__) @@ -1858,7 +1857,8 @@ def run_batch(self): json.dump(self.cfg, f) # Collect simulations to queue - df = pd.read_csv(buildstock_csv_filename, index_col=0) + df = read_csv(buildstock_csv_filename, index_col=0, dtype=str) + self.validate_buildstock_csv(self.project_filename, df) building_ids = df.index.tolist() 
n_datapoints = len(building_ids) n_sims = n_datapoints * (len(self.cfg.get('upgrades', [])) + 1) diff --git a/buildstockbatch/base.py b/buildstockbatch/base.py index 5ec1c690..b646267d 100644 --- a/buildstockbatch/base.py +++ b/buildstockbatch/base.py @@ -16,7 +16,6 @@ import logging from lxml import objectify import os -import pandas as pd import numpy as np import re import requests @@ -36,7 +35,7 @@ postprocessing ) from buildstockbatch.exc import SimulationExists, ValidationError -from buildstockbatch.utils import path_rel_to_file, get_project_configuration +from buildstockbatch.utils import path_rel_to_file, get_project_configuration, read_csv from buildstockbatch.__version__ import __version__ as bsb_version logger = logging.getLogger(__name__) @@ -192,7 +191,7 @@ def cleanup_sim_dir(sim_dir, dest_fs, simout_ts_dir, upgrade_id, building_id): timeseries_filepath = os.path.join(sim_dir, 'run', 'results_timeseries.csv') # FIXME: Allowing both names here for compatibility. Should consolidate on one timeseries filename. 
if os.path.isfile(timeseries_filepath): - units_dict = pd.read_csv(timeseries_filepath, nrows=1).transpose().to_dict()[0] + units_dict = read_csv(timeseries_filepath, nrows=1).transpose().to_dict()[0] skiprows = [1] else: timeseries_filepath = os.path.join(sim_dir, 'run', 'enduse_timeseries.csv') @@ -208,15 +207,15 @@ def cleanup_sim_dir(sim_dir, dest_fs, simout_ts_dir, upgrade_id, building_id): if os.path.isfile(timeseries_filepath): # Find the time columns present in the enduse_timeseries file possible_time_cols = ['time', 'Time', 'TimeDST', 'TimeUTC'] - cols = pd.read_csv(timeseries_filepath, index_col=False, nrows=0).columns.tolist() + cols = read_csv(timeseries_filepath, index_col=False, nrows=0).columns.tolist() actual_time_cols = [c for c in cols if c in possible_time_cols] if not actual_time_cols: logger.error(f'Did not find any time column ({possible_time_cols}) in {timeseries_filepath}.') raise RuntimeError(f'Did not find any time column ({possible_time_cols}) in {timeseries_filepath}.') - tsdf = pd.read_csv(timeseries_filepath, parse_dates=actual_time_cols, skiprows=skiprows) + tsdf = read_csv(timeseries_filepath, parse_dates=actual_time_cols, skiprows=skiprows) if os.path.isfile(schedules_filepath): - schedules = pd.read_csv(schedules_filepath, dtype=np.float64) + schedules = read_csv(schedules_filepath, dtype=np.float64) schedules.rename(columns=lambda x: f'schedules_{x}', inplace=True) schedules['TimeDST'] = tsdf['Time'] tsdf = tsdf.merge(schedules, how='left', on='TimeDST') @@ -302,7 +301,7 @@ def validate_openstudio_path(cls, project_file): if os_sha != actual_os_sha: raise ValidationError( f"OpenStudio version is correct at {os_version}, but the shas don't match. 
" - "Got {actual_os_sha}, expected {os_sha}" + f"Got {actual_os_sha}, expected {os_sha}" ) return True @@ -315,7 +314,38 @@ def validate_sampler(project_file): except AttributeError: raise ValidationError(f'Sampler class `{sampler_name}` is not available.') args = cfg['sampler']['args'] - return Sampler.validate_args(project_file, **args) + Sampler.validate_args(project_file, **args) + if issubclass(Sampler, sampler.PrecomputedSampler): + sample_file = cfg['sampler']['args']['sample_file'] + if not os.path.isabs(sample_file): + sample_file = os.path.join(os.path.dirname(project_file), sample_file) + else: + sample_file = os.path.abspath(sample_file) + buildstock_df = read_csv(sample_file, dtype=str) + BuildStockBatchBase.validate_buildstock_csv(project_file, buildstock_df) + return True + + + @staticmethod + def validate_buildstock_csv(project_file, buildstock_df): + param_option_dict, _ = BuildStockBatchBase.get_param_option_dict(project_file) + # verify that all the Columns in buildstock_df only have values available in param_option_dict + # param_option_dict has format: {column_name: [valid_option1, valid_option2, ...], ...} + errors = [] + for column in buildstock_df.columns: + if column in {'Building'}: + continue + if column not in param_option_dict: + errors.append(f'Column {column} in buildstock_csv is not available in options_lookup.tsv') + continue + for option in buildstock_df[column].unique(): + if option not in param_option_dict[column]: + errors.append(f'Option {option} in column {column} of buildstock_csv is not available ' + f'in options_lookup.tsv') + if errors: + raise ValidationError('\n'.join(errors)) + + return True @classmethod def validate_workflow_generator(cls, project_file): diff --git a/buildstockbatch/eagle.py b/buildstockbatch/eagle.py index 278ff467..2a90cd5f 100644 --- a/buildstockbatch/eagle.py +++ b/buildstockbatch/eagle.py @@ -41,7 +41,8 @@ get_error_details, ContainerRuntime, path_rel_to_file, - get_project_configuration + 
get_project_configuration, + read_csv ) from buildstockbatch import postprocessing from buildstockbatch.__version__ import __version__ as bsb_version @@ -91,7 +92,8 @@ def validate_output_directory_eagle(cls, project_file): cfg = get_project_configuration(project_file) output_dir = path_rel_to_file(project_file, cfg['output_directory']) if not (output_dir.startswith('/scratch') or output_dir.startswith('/projects')): - raise ValidationError(f"`output_directory` must be in /scratch or /projects, `output_directory` = {output_dir}") + raise ValidationError(f"`output_directory` must be in /scratch or /projects," + f" `output_directory` = {output_dir}") @property def output_dir(self): @@ -196,7 +198,8 @@ def run_batch(self, sampling_only=False): return # Determine the number of simulations expected to be executed - df = pd.read_csv(buildstock_csv_filename, index_col=0) + df = read_csv(buildstock_csv_filename, index_col=0, dtype=str) + self.validate_buildstock_csv(self.project_filename, df) # find out how many buildings there are to simulate building_ids = df.index.tolist() diff --git a/buildstockbatch/local.py b/buildstockbatch/local.py index a26ed6c2..c001977d 100644 --- a/buildstockbatch/local.py +++ b/buildstockbatch/local.py @@ -21,7 +21,6 @@ import json import logging import os -import pandas as pd import pathlib import re import shutil @@ -30,7 +29,7 @@ from buildstockbatch.base import BuildStockBatchBase, SimulationExists from buildstockbatch import postprocessing -from buildstockbatch.utils import log_error_details, ContainerRuntime +from buildstockbatch.utils import log_error_details, ContainerRuntime, read_csv from buildstockbatch.__version__ import __version__ as bsb_version logger = logging.getLogger(__name__) @@ -232,7 +231,9 @@ def run_batch(self, n_jobs=None, measures_only=False, sampling_only=False): shutil.copytree(buildstock_path / "resources", lib_path / "resources") shutil.copytree(project_path / "housing_characteristics", lib_path / 
"housing_characteristics") - df = pd.read_csv(buildstock_csv_filename, index_col=0) + df = read_csv(buildstock_csv_filename, index_col=0, dtype=str) + self.validate_buildstock_csv(self.project_filename, df) + building_ids = df.index.tolist() n_datapoints = len(building_ids) run_building_d = functools.partial( diff --git a/buildstockbatch/sampler/commercial_sobol.py b/buildstockbatch/sampler/commercial_sobol.py index 639c2f62..7e1ac4b8 100644 --- a/buildstockbatch/sampler/commercial_sobol.py +++ b/buildstockbatch/sampler/commercial_sobol.py @@ -21,7 +21,7 @@ from .sobol_lib import i4_sobol_generate from .base import BuildStockSampler -from buildstockbatch.utils import ContainerRuntime +from buildstockbatch.utils import ContainerRuntime, read_csv from buildstockbatch.exc import ValidationError logger = logging.getLogger(__name__) @@ -81,7 +81,7 @@ def run_sampling(self): tsv_hash = {} for tsv_file in os.listdir(self.buildstock_dir): if '.tsv' in tsv_file: - tsv_df = pd.read_csv(os.path.join(self.buildstock_dir, tsv_file), sep='\t') + tsv_df = read_csv(os.path.join(self.buildstock_dir, tsv_file), sep='\t') dependency_columns = [item for item in list(tsv_df) if 'Dependency=' in item] tsv_df[dependency_columns] = tsv_df[dependency_columns].astype('str') tsv_hash[tsv_file.replace('.tsv', '')] = tsv_df diff --git a/buildstockbatch/sampler/downselect.py b/buildstockbatch/sampler/downselect.py index 575fb3e8..e71578c1 100644 --- a/buildstockbatch/sampler/downselect.py +++ b/buildstockbatch/sampler/downselect.py @@ -12,11 +12,11 @@ import math import numpy as np import os -import pandas as pd import shutil from .base import BuildStockSampler from buildstockbatch.exc import ValidationError +from buildstockbatch.utils import read_csv logger = logging.getLogger(__name__) @@ -107,7 +107,7 @@ def run_sampling(self): n_samples_init = 350000 init_sampler = self.SUB_SAMPLER_CLASS(self.parent(), n_datapoints=n_samples_init, **self.sub_kw) buildstock_csv_filename = 
init_sampler.run_sampling() - df = pd.read_csv(buildstock_csv_filename, index_col=0) + df = read_csv(buildstock_csv_filename, index_col=0, dtype=str) df_new = df[self.downselect_logic(df, self.logic)] downselected_n_samples_init = df_new.shape[0] n_samples = math.ceil(self.n_datapoints * n_samples_init / downselected_n_samples_init) @@ -120,7 +120,7 @@ def run_sampling(self): with gzip.open(os.path.splitext(buildstock_csv_filename)[0] + '_orig.csv.gz', 'wb') as f_out: with open(buildstock_csv_filename, 'rb') as f_in: shutil.copyfileobj(f_in, f_out) - df = pd.read_csv(buildstock_csv_filename, index_col=0, dtype='str') + df = read_csv(buildstock_csv_filename, index_col=0, dtype='str') df_new = df[self.downselect_logic(df, self.logic)] if len(df_new.index) == 0: raise RuntimeError('There are no buildings left after the down select!') diff --git a/buildstockbatch/test/test_base.py b/buildstockbatch/test/test_base.py index 6fe0c7ce..f38d4032 100644 --- a/buildstockbatch/test/test_base.py +++ b/buildstockbatch/test/test_base.py @@ -20,6 +20,7 @@ from buildstockbatch.local import LocalBatch from buildstockbatch.exc import ValidationError from buildstockbatch.postprocessing import write_dataframe_as_parquet +from buildstockbatch.utils import read_csv dask.config.set(scheduler='synchronous') here = os.path.dirname(os.path.abspath(__file__)) @@ -51,7 +52,7 @@ def test_reference_scenario(basic_residential_project_file): # test results.csv files test_path = os.path.join(results_dir, 'results_csvs') - test_csv = pd.read_csv(os.path.join(test_path, 'results_up01.csv.gz')).set_index('building_id').sort_index() + test_csv = read_csv(os.path.join(test_path, 'results_up01.csv.gz')).set_index('building_id').sort_index() assert len(test_csv['apply_upgrade.reference_scenario'].unique()) == 1 assert test_csv['apply_upgrade.reference_scenario'].iloc[0] == 'example_reference_scenario' @@ -79,16 +80,16 @@ def simplify_columns(colname): reference_path = 
os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test_results', 'results_csvs') test_path = os.path.join(results_dir, 'results_csvs') - test_csv = pd.read_csv(os.path.join(test_path, 'results_up00.csv.gz')).rename(columns=simplify_columns).\ + test_csv = read_csv(os.path.join(test_path, 'results_up00.csv.gz')).rename(columns=simplify_columns).\ sort_values('buildingid').reset_index().drop(columns=['index']) - reference_csv = pd.read_csv(os.path.join(reference_path, 'results_up00.csv.gz')).rename(columns=simplify_columns).\ + reference_csv = read_csv(os.path.join(reference_path, 'results_up00.csv.gz')).rename(columns=simplify_columns).\ sort_values('buildingid').reset_index().drop(columns=['index']) mutul_cols = list(set(test_csv.columns).intersection(set(reference_csv))) pd.testing.assert_frame_equal(test_csv[mutul_cols], reference_csv[mutul_cols]) - test_csv = pd.read_csv(os.path.join(test_path, 'results_up01.csv.gz')).rename(columns=simplify_columns).\ + test_csv = read_csv(os.path.join(test_path, 'results_up01.csv.gz')).rename(columns=simplify_columns).\ sort_values('buildingid').reset_index().drop(columns=['index']) - reference_csv = pd.read_csv(os.path.join(reference_path, 'results_up01.csv.gz')).rename(columns=simplify_columns).\ + reference_csv = read_csv(os.path.join(reference_path, 'results_up01.csv.gz')).rename(columns=simplify_columns).\ sort_values('buildingid').reset_index().drop(columns=['index']) mutul_cols = list(set(test_csv.columns).intersection(set(reference_csv))) pd.testing.assert_frame_equal(test_csv[mutul_cols], reference_csv[mutul_cols]) @@ -186,15 +187,15 @@ def test_combine_files(basic_residential_project_file): reference_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test_results', 'results_csvs') test_path = os.path.join(results_dir, 'results_csvs') - test_csv = pd.read_csv(os.path.join(test_path, 'results_up00.csv.gz')).sort_values('building_id').reset_index()\ + test_csv = read_csv(os.path.join(test_path, 
'results_up00.csv.gz')).sort_values('building_id').reset_index()\ .drop(columns=['index']) - reference_csv = pd.read_csv(os.path.join(reference_path, 'results_up00.csv.gz')).sort_values('building_id')\ + reference_csv = read_csv(os.path.join(reference_path, 'results_up00.csv.gz')).sort_values('building_id')\ .reset_index().drop(columns=['index']) pd.testing.assert_frame_equal(test_csv, reference_csv) - test_csv = pd.read_csv(os.path.join(test_path, 'results_up01.csv.gz')).sort_values('building_id').reset_index()\ + test_csv = read_csv(os.path.join(test_path, 'results_up01.csv.gz')).sort_values('building_id').reset_index()\ .drop(columns=['index']) - reference_csv = pd.read_csv(os.path.join(reference_path, 'results_up01.csv.gz')).sort_values('building_id')\ + reference_csv = read_csv(os.path.join(reference_path, 'results_up01.csv.gz')).sort_values('building_id')\ .reset_index().drop(columns=['index']) pd.testing.assert_frame_equal(test_csv, reference_csv) @@ -398,7 +399,7 @@ def test_skipping_baseline(basic_residential_project_file): def test_provide_buildstock_csv(basic_residential_project_file, mocker): buildstock_csv = os.path.join(here, 'buildstock.csv') - df = pd.read_csv(buildstock_csv) + df = read_csv(buildstock_csv, dtype=str) project_filename, results_dir = basic_residential_project_file({ 'sampler': { 'type': 'precomputed', @@ -412,9 +413,9 @@ def test_provide_buildstock_csv(basic_residential_project_file, mocker): bsb = LocalBatch(project_filename) sampling_output_csv = bsb.sampler.run_sampling() - df2 = pd.read_csv(sampling_output_csv) + df2 = read_csv(sampling_output_csv, dtype=str) pd.testing.assert_frame_equal(df, df2) - + assert (df['Geometry Shared Walls'] == "None").all() # Verify None is being read properly # Test file missing with open(project_filename, 'r') as f: cfg = yaml.safe_load(f) diff --git a/buildstockbatch/test/test_eagle.py b/buildstockbatch/test/test_eagle.py index d8e914c4..f21c4f25 100644 --- a/buildstockbatch/test/test_eagle.py +++ 
b/buildstockbatch/test/test_eagle.py @@ -11,7 +11,7 @@ from buildstockbatch.eagle import user_cli, EagleBatch from buildstockbatch.base import BuildStockBatchBase -from buildstockbatch.utils import get_project_configuration +from buildstockbatch.utils import get_project_configuration, read_csv here = os.path.dirname(os.path.abspath(__file__)) @@ -281,8 +281,8 @@ def compare_ts_parquets(source, dst): compare_ts_parquets(file, results_file) # Check that buildstock.csv was trimmed properly - local_buildstock_df = pd.read_csv(results_dir / 'local_housing_characteristics_dir' / 'buildstock.csv') - unique_buildings = {x[0] for x in job_json['batch']} + local_buildstock_df = read_csv(results_dir / 'local_housing_characteristics_dir' / 'buildstock.csv', dtype=str) + unique_buildings = {str(x[0]) for x in job_json['batch']} assert len(unique_buildings) == len(local_buildstock_df) assert unique_buildings == set(local_buildstock_df['Building']) diff --git a/buildstockbatch/test/test_inputs/test_openstudio_buildstock/resources/buildstock_bad.csv b/buildstockbatch/test/test_inputs/test_openstudio_buildstock/resources/buildstock_bad.csv new file mode 100644 index 00000000..b63a2744 --- /dev/null +++ b/buildstockbatch/test/test_inputs/test_openstudio_buildstock/resources/buildstock_bad.csv @@ -0,0 +1,6 @@ +Building,Bedroom,Location,Vintage,State,Insulation Wall,Insulation +1,1,AL_Mobile-Rgnl.AP.722230,1940-1950,CO,Good Option,None +2,2,AL_Mobile-Rgnl.AP.722230,1940s,CO,Good Option,None +3,2,AL_Mobile-Rgnl.AP.722230,2010s,VA,Good Option,None +4,2,AL_Mobile-Rgnl.AP.722230,2000s,TX,,None +5,3,AL_Mobile-Rgnl.AP.722230,1970s,VA,,None \ No newline at end of file diff --git a/buildstockbatch/test/test_inputs/test_openstudio_buildstock/resources/buildstock_good.csv b/buildstockbatch/test/test_inputs/test_openstudio_buildstock/resources/buildstock_good.csv new file mode 100644 index 00000000..accc2816 --- /dev/null +++ 
b/buildstockbatch/test/test_inputs/test_openstudio_buildstock/resources/buildstock_good.csv @@ -0,0 +1,6 @@ +Building,Bedroom,Location,Vintage,State,Insulation Wall,Insulation Slab +1,1,AL_Mobile-Rgnl.AP.722230,<1950,CO,Good Option,None +2,3,AL_Mobile-Rgnl.AP.722230,1940s,CO,Good Option,None +3,2,AL_Mobile-Rgnl.AP.722230,2010s,VA,Good Option,None +4,1,AL_Mobile-Rgnl.AP.722230,2000s,VA,Good Option,None +5,2,AL_Mobile-Rgnl.AP.722230,1970s,VA,Good Option,None \ No newline at end of file diff --git a/buildstockbatch/test/test_inputs/test_openstudio_buildstock/resources/options_lookup.tsv b/buildstockbatch/test/test_inputs/test_openstudio_buildstock/resources/options_lookup.tsv index f8827f10..2f5e09f7 100644 --- a/buildstockbatch/test/test_inputs/test_openstudio_buildstock/resources/options_lookup.tsv +++ b/buildstockbatch/test/test_inputs/test_openstudio_buildstock/resources/options_lookup.tsv @@ -16,6 +16,9 @@ State VA State CO County County1 County County2 +Bedroom 1 +Bedroom 2 +Bedroom 3 Insulation Slab None Insulation Slab Good Option ResidentialConstructionsSlab perimeter_r=0 perimeter_width=0 whole_r=0 gap_r=0 exterior_r=0 exterior_depth=0 Insulation Slab Missing Argument ResidentialConstructionsSlab perimeter_r=0 perimeter_width=0 whole_r=10 gap_r=5 exterior_r=0 diff --git a/buildstockbatch/test/test_postprocessing.py b/buildstockbatch/test/test_postprocessing.py index dd6bdc56..7e6e2b6e 100644 --- a/buildstockbatch/test/test_postprocessing.py +++ b/buildstockbatch/test/test_postprocessing.py @@ -3,7 +3,6 @@ import json import logging import os -import pandas as pd import pathlib import re import tarfile @@ -13,7 +12,7 @@ from buildstockbatch import postprocessing from buildstockbatch.base import BuildStockBatchBase -from buildstockbatch.utils import get_project_configuration +from buildstockbatch.utils import get_project_configuration, read_csv postprocessing.performance_report = MagicMock() @@ -58,7 +57,7 @@ def 
test_report_additional_results_csv_columns(basic_residential_project_file): postprocessing.combine_results(fs, results_dir, cfg, do_timeseries=False) for upgrade_id in (0, 1): - df = pd.read_csv(str(results_dir / 'results_csvs' / f'results_up{upgrade_id:02d}.csv.gz')) + df = read_csv(str(results_dir / 'results_csvs' / f'results_up{upgrade_id:02d}.csv.gz')) assert (df['reporting_measure1.column_1'] == 1).all() assert (df['reporting_measure1.column_2'] == 2).all() assert (df['reporting_measure2.column_3'] == 3).all() diff --git a/buildstockbatch/test/test_validation.py b/buildstockbatch/test/test_validation.py index 76c328d7..f924c5a7 100644 --- a/buildstockbatch/test/test_validation.py +++ b/buildstockbatch/test/test_validation.py @@ -20,9 +20,9 @@ from buildstockbatch.eagle import EagleBatch from buildstockbatch.local import LocalBatch from buildstockbatch.base import BuildStockBatchBase, ValidationError -from buildstockbatch.eagle import EagleBatch +import pandas as pd from buildstockbatch.test.shared_testing_stuff import resstock_directory, resstock_required -from buildstockbatch.utils import get_project_configuration +from buildstockbatch.utils import get_project_configuration, read_csv from unittest.mock import patch from testfixtures import LogCapture from yamale.yamale_error import YamaleError @@ -31,7 +31,7 @@ here = os.path.dirname(os.path.abspath(__file__)) example_yml_dir = os.path.join(here, 'test_inputs') - +resources_dir = os.path.join(here, 'test_inputs', 'test_openstudio_buildstock', 'resources') def filter_logs(logs, level): filtered_logs = '' @@ -139,7 +139,7 @@ def test_good_reference_scenario(project_file): ]) def test_bad_measures(project_file): - with LogCapture(level=logging.INFO) as logs: + with LogCapture(level=logging.INFO) as _: try: BuildStockBatchBase.validate_workflow_generator(project_file) except (ValidationError, YamaleError) as er: @@ -304,3 +304,35 @@ def test_validate_eagle_output_directory(): with open(temp_yml, 'w') as f: 
yaml.dump(cfg, f, Dumper=yaml.SafeDumper) EagleBatch.validate_output_directory_eagle(str(temp_yml)) + + +def test_validate_sampler_good_buildstock(basic_residential_project_file): + project_filename, _ = basic_residential_project_file({ + 'sampler': { + 'type': 'precomputed', + 'args': { + 'sample_file': str(os.path.join(resources_dir, 'buildstock_good.csv')) + } + } + }) + assert BuildStockBatchBase.validate_sampler(project_filename) + +def test_validate_sampler_bad_buildstock(basic_residential_project_file): + project_filename, _ = basic_residential_project_file({ + 'sampler': { + 'type': 'precomputed', + 'args': { + 'sample_file': str(os.path.join(resources_dir, 'buildstock_bad.csv')) + } + } + }) + try: + BuildStockBatchBase.validate_sampler(project_filename) + except ValidationError as er: + er = str(er) + assert 'Option 1940-1950 in column Vintage of buildstock_csv is not available in options_lookup.tsv' in er + assert 'Option TX in column State of buildstock_csv is not available in options_lookup.tsv' in er + assert 'Option nan in column Insulation Wall of buildstock_csv is not available in options_lookup.tsv' in er + assert 'Column Insulation in buildstock_csv is not available in options_lookup.tsv' in er + else: + raise Exception("validate_sampler was supposed to raise ValidationError for buildstock_bad.csv") diff --git a/buildstockbatch/utils.py b/buildstockbatch/utils.py index ef4bfb47..e2316df5 100644 --- a/buildstockbatch/utils.py +++ b/buildstockbatch/utils.py @@ -4,6 +4,8 @@ import logging import traceback import yaml +import pandas as pd + logger = logging.getLogger(__name__) @@ -14,6 +16,12 @@ class ContainerRuntime(enum.Enum): LOCAL_OPENSTUDIO = 3 +def read_csv(csv_file_path, **kwargs) -> pd.DataFrame: + default_na_values = pd._libs.parsers.STR_NA_VALUES + df = pd.read_csv(csv_file_path, na_values=list(default_na_values - {"None"}), keep_default_na=False, **kwargs) + return df + + def path_rel_to_file(startfile, x): if
os.path.isabs(x): return os.path.abspath(x) diff --git a/docs/changelog/changelog_dev.rst b/docs/changelog/changelog_dev.rst index 75eeab3d..43042481 100644 --- a/docs/changelog/changelog_dev.rst +++ b/docs/changelog/changelog_dev.rst @@ -14,3 +14,11 @@ Development Changelog This is an example change. Please copy and paste it - for valid tags please refer to ``conf.py`` in the docs directory. ``pullreq`` should be set to the appropriate pull request number and ``tickets`` to any related github issues. These will be automatically linked in the documentation. + + .. change:: + :tags: bugfix, feature + :pullreq: 374 + :tickets: 373 + + Add read_csv function to utils to handle parsing "None" correctly with pandas 2.0+. Also add a validator for + buildstock_csv that checks if all the entries are available in the options_lookup.tsv. diff --git a/setup.py b/setup.py index ab4510c7..c51b67a2 100644 @@ -33,7 +33,7 @@ 'pyyaml', 'requests', 'numpy', - 'pandas', + 'pandas>=2', 'joblib', 'pyarrow', 'dask[complete]>=2022.10.0',