In [1]:
import sys
base_directory = "/Users/johnbarrera/Documents/Projects/world_bank/Climate-and-Disaster-Risk-Management-for-Health-Systems/"
sys.path.append(f"{base_directory}")

In [2]:
from src.utils.config_reader import ConfigReader, Logger
from src.utils.utils import GeoDataFrameOperations
from src.utils.file_pocessor import FileLister, FileProcessor

In [3]:
# Create an instance of the Logger
log_directory = "/Users/johnbarrera/Documents/Projects/world_bank/Climate-and-Disaster-Risk-Management-for-Health-Systems/data/nepal/outputs/log"
log_file_name = "preprocessing"
logger = Logger(log_directory, log_file_name)

In [4]:
# Readding the configuration file to the preprocessing
config_file_path = "config/nepal/setup_preprocessing.yaml"
config_file_path = f"{base_directory}{config_file_path}"

try:
    config_data = ConfigReader.read_yaml_file(config_file_path)
    config_data = config_data['preprocessing']
    txt_msg = "Content of {} file successfully read".format(config_file_path)
    logger.info(txt_msg)
except Exception as e:
    txt_msg = f"Error reading configuration file: {str(e)}"
    logger.error(txt_msg)

INFO: Content of /Users/johnbarrera/Documents/Projects/world_bank/Climate-and-Disaster-Risk-Management-for-Health-Systems/config/nepal/setup_preprocessing.yaml file successfully read


In [5]:
# Identifying modules for preprocessing
items_preprocessing = ['hazards', 'infrastructures', 'population']
hazards_task = None
infrastructures_task = None
population_task = None

items_config_data = [task for task in config_data]
try:
    for item in items_preprocessing:
        if item not in items_config_data:
            raise ValueError(f"Item {item} not found in config_data list")
    txt_msg = f"All elements are present in config_data list: {str(items_config_data)}"
    logger.info(txt_msg)
except ValueError as e:
    txt_msg = f"'{e}'"
    logger.error(txt_msg)
    

try:
    for task in items_config_data:
        if task == "hazards":
            hazards_task = config_data[task]
        elif task == "infrastructures":
            infrastructures_task = config_data[task]
        elif task == "population":
            population_task = config_data[task]
        else:
            raise ValueError(f"Unexpected task '{task}' found in config data")
    txt_msg = f"Tasks successfully extracted from config data"
    logger.info(txt_msg)
except KeyError as e:
    txt_msg = f"Error: '{e}' not found in config data"
    logger.error(txt_msg)
except ValueError as e:
    txt_msg = f"Error: {e}"
    logger.error(txt_msg)
except Exception as e:
    txt_msg = f"An unexpected error occurred: {str(e)}"
    logger.error(txt_msg)


INFO: All elements are present in config_data list: ['hazards', 'infrastructures', 'population']
INFO: Tasks successfully extracted from config data


In [6]:
# Write different types of messages to the log file
# logger.info('Information message')
# logger.warning('Warning message')
# logger.error('Error message')
# logger.critical('Critical message')
# logger.debug('Debug message')


In [7]:
# Processing Hazards task
for task in hazards_task:
    task_type = task['type']
    task_historical = task['historical']
    return_periods = task['return_periods']
    substantial_damage = task['substantial_damage']
    complete_destruction = task['complete_destruction']
    input_path = f"{base_directory}{task['source']}"
    output_path = f"{base_directory}{task['destination']}"

    if task_historical:
        txt_msg = f"The file contents historical records for {task_type}"
        files = FileLister.list_files(input_path)
        
        try:
            file = files[0]
        except Exception as e:
            txt_msg = f"Error: '{e}'"
            logger.error(txt_msg)
  
            
            
        # Read valid files
        file_name = file[0]
        file_path = file[1]  
            
        txt_msg = f"Processing historical file"
        print(txt_msg)
        
        # Transforming raster
        gdf = FileProcessor.read_tif(file_path, 'polygon')
        # Column damage generation
        gdf = GeoDataFrameOperations.calculate_damage(gdf, substantial_damage, complete_destruction)
        # Save file
        output_name = f'{task_type}_historical.gpkg'
        FileProcessor.save_to_geopackage(gdf, output_path, output_name)

        
    else:
        txt_msg = f"The file contents periodical records for {task_type}"
        files = FileLister.list_files(input_path)
        valid_files = []
        counters = 1
        for period in return_periods:
            validate_period = False
            counters = 0
            while not validate_period and counters <len(files):
                if str(period) in files[counters][0]:
                    validate_period = True
                    valid_files.append([period, files[counters][0], files[counters][1]])
                counters+=1
        
        if set(return_periods) != set([f[0] for f in valid_files]):
            elements_not_in_set = set(return_periods)  - set([f[0] for f in valid_files])
            txt_msg = f"Some periods files not found {elements_not_in_set}"
            print(txt_msg)
        else:
            txt_msg = f"All periods files found"
            print(txt_msg)
            
                    
        # Read valid files 
        for file in valid_files:
            file_period = file[0]
            file_name = file[1]
            file_path = file[2]
            
            txt_msg = f"Processing period {file_period}: {file_name}"
            print(txt_msg)
            
            txt_msg = f"{file_path}"
            print(txt_msg)
            
            # Transforming raster
            gdf = FileProcessor.read_tif(file_path, 'polygon')
            # Column damage generation
            gdf = GeoDataFrameOperations.calculate_damage(gdf, substantial_damage, complete_destruction)
            # print(gdf['value'].sum())
            # Save file
            output_name = f'{task_type}_period_{file_period}.gpkg'
            FileProcessor.save_to_geopackage(gdf, output_path, output_name)


All periods files found
Processing period 475: pga_2475.tif
/Users/johnbarrera/Documents/Projects/world_bank/Climate-and-Disaster-Risk-Management-for-Health-Systems/data/nepal/inputs/hazards/Global_earthquake_hazard_WB/pga_2475.tif


100%|██████████| 361/361 [00:01<00:00, 239.28it/s]


Processing period 975: pga_975.tif
/Users/johnbarrera/Documents/Projects/world_bank/Climate-and-Disaster-Risk-Management-for-Health-Systems/data/nepal/inputs/hazards/Global_earthquake_hazard_WB/pga_975.tif


100%|██████████| 361/361 [00:01<00:00, 243.21it/s]


Processing period 1500: pga_1500.tif
/Users/johnbarrera/Documents/Projects/world_bank/Climate-and-Disaster-Risk-Management-for-Health-Systems/data/nepal/inputs/hazards/Global_earthquake_hazard_WB/pga_1500.tif


100%|██████████| 361/361 [00:01<00:00, 226.51it/s]


Processing period 2475: pga_2475.tif
/Users/johnbarrera/Documents/Projects/world_bank/Climate-and-Disaster-Risk-Management-for-Health-Systems/data/nepal/inputs/hazards/Global_earthquake_hazard_WB/pga_2475.tif


100%|██████████| 361/361 [00:01<00:00, 231.64it/s]


Processing historical file


100%|██████████| 361/361 [00:01<00:00, 230.91it/s]


In [10]:
# Processing Population task
for task in population_task:
    task_type = task['type']
    input_path = f"{base_directory}{task['source']}"
    output_path = f"{base_directory}{task['destination']}"
    
    txt_msg = f"Processing population file: {input_path}"
    print(txt_msg)
    # Transforming raster
    gdf = FileProcessor.read_tif(input_path, 'polygon')
    gdf.columns = ['band', 'population', 'geometry']
    gdf['geometry'] = gdf.geometry.centroid
    gdf = gdf[['population', 'geometry']]
    
    
    output_name = f'{task_type}.gpkg'
    FileProcessor.save_to_geopackage(gdf, output_path, output_name)


Processing population file: /Users/johnbarrera/Documents/Projects/world_bank/Climate-and-Disaster-Risk-Management-for-Health-Systems/data/nepal/inputs/popu/nepal_npl_ct_popu_pop_sp_py_GHS_2023_p_u_Clipped_E2020_Nepal_4326.tif


100%|██████████| 4919/4919 [01:52<00:00, 43.77it/s]

  gdf['geometry'] = gdf.geometry.centroid


Processing population file: /Users/johnbarrera/Documents/Projects/world_bank/Climate-and-Disaster-Risk-Management-for-Health-Systems/data/nepal/inputs/popu/nepal_npl_ct_popu_pop_sp_py_GHS_2023_p_u_Clipped_E2020_Nepal_54009.tif


100%|██████████| 5511/5511 [01:52<00:00, 48.84it/s]

  gdf['geometry'] = gdf.geometry.centroid


In [9]:
# Processing Infrastructure task
for task in infrastructures_task:
    print(task)
    task_type = task['type']
    task_filter= task['filter']
    input_path = f"{base_directory}{task['source']}"
    output_path = f"{base_directory}{task['destination']}"
    
    gdf = FileProcessor.read_geopackage(input_path)
    if task_filter is not None:
        filter_str = task_filter[0]
        gdf = gdf.query(filter_str)
        gdf = gdf.reset_index(drop=True)
        
    txt_msg = f"Processing infrastructure file: {task_type}"
    print(txt_msg)
    output_name = f'{task_type}_infrastructure.gpkg'
    FileProcessor.save_to_geopackage(gdf, output_path, output_name)
    
    

{'type': 'PHC', 'filter': ["HF_T_RO=='Primary Health Care Center'"], 'source': 'data/nepal/inputs/heal/nepal_npl_ct_heal_heal_sp_tab_NDRRNA_14022024.gpkg', 'destination': 'data/nepal/outputs/infrastructure/'}
Processing infrastructure file: PHC
{'type': 'healt_facilities', 'filter': ["HF_T_RO!='Primary Health Care Center'"], 'source': 'data/nepal/inputs/heal/nepal_npl_ct_heal_heal_sp_tab_NDRRNA_14022024.gpkg', 'destination': 'data/nepal/outputs/infrastructure/'}
Processing infrastructure file: healt_facilities
