In [1]:
import pandas as pd
import numpy as np
import sys
import os
import re

In [14]:
# ---- user inputs ----

# Folder containing the gistool output CSV files. File names are appended to
# this string directly, so the trailing slash is required.
path_general = '/scratch/mia725/calibration_workflow/Smoky_river/gistool-outputs/'

# Land cover settings:
#   num_land_cover         - number of land cover classes (frac_1 .. frac_N)
#   minimume_land_fraction - fractions below this value are set to 0 and the
#                            remaining fractions are normalized
num_land_cover = 19
minimume_land_fraction = 0.05

# Soil settings:
#   num_soil_type - number of soil types
#   unify_soil    - set to True if soil is not used in GRU creation; every
#                   subbasin is then assigned the most dominant soil type
num_soil_type = 12
unify_soil = True

### Sanity check for soil type. The soil type may be set to 0 (unknown) for lakes or other water bodies. In this study such values, as well as any NaN values, are replaced with the majority soil type of the domain.

In [15]:
# Full paths of the three gistool summary tables (soil, land cover, elevation)
path_soil_type = f'{path_general}domain_stats_soil_classes.csv'
path_landcover_type = f'{path_general}domain_stats_NA_NALCMS_landcover_2020_30m.csv'
path_elevation_mean = f'{path_general}domain_stats_elv.csv'

# # should be removed when gistool is fixed for mertihydro
# from copy import deepcopy
# elevation_mean = deepcopy(soil_type)
# elevation_mean = elevation_mean[['COMID']]
# elevation_mean['min']=1;elevation_mean['max']=10;elevation_mean['mean']=5;elevation_mean['median']=5
# if not os.path.isdir(path_general+'merit_hydro'):
#     os.makedirs(path_general+'merit_hydro')
# elevation_mean.to_csv(path_general + 'merit_hydro/domain_stats_elv.csv')

# Read each table, order its rows by COMID and renumber the index from 0 so
# that the three frames can later be compared row by row.
soil_type = (
    pd.read_csv(path_soil_type)
    .sort_values(by='COMID')
    .reset_index(drop=True)
)
landcover_type = (
    pd.read_csv(path_landcover_type)
    .sort_values(by='COMID')
    .reset_index(drop=True)
)
elevation_mean = (
    pd.read_csv(path_elevation_mean)
    .sort_values(by='COMID')
    .reset_index(drop=True)
)


# Check that the COMIDs are identical in all three files (i.e. the files were generated from the same shapefile)

In [16]:
# Consistency check: the soil, land cover and elevation tables must describe
# exactly the same subbasins. The three frames are sorted by COMID when they
# are loaded, so they can be compared position by position.

# 1) Same number of rows in all three tables.
#    (The message mentions soil and land cover only, but it is also raised
#    when the elevation table has a different length.)
if not (len(soil_type) == len(landcover_type) == len(elevation_mean)):
    sys.exit('The provided length of soil and land cover is not identical')

# 2) Same COMID in every row.
#    The comparison is element-wise: summing the differences is not a valid
#    test because positive and negative differences can cancel out
#    (e.g. [1, 4] vs [2, 3] sums to zero although the IDs differ).
soil_comid = soil_type['COMID'].values
landcover_comid = landcover_type['COMID'].values
elevation_comid = elevation_mean['COMID'].values

if not (np.array_equal(soil_comid, landcover_comid) and
        np.array_equal(landcover_comid, elevation_comid)):
    sys.exit('The COMID of the shapefile in soil and land cover is not identical')

# soil maps sanity check

### If there is NaN, replace it with the majority soil type in the domain
### If there is 0 (unidentified), replace it with the majority soil type in the domain

In [17]:
# Soil sanity check.
# The dominant soil class of a subbasin ('majority') may be missing (NaN) or
# unidentified (0). Both are replaced by the most frequent valid class found
# in the domain.

# Abort when no subbasin carries a usable class. The test is done per element
# so that a column mixing NaN and 0 is caught as well; in that case there is
# no valid value left to compute the domain majority from.
is_missing_or_zero = soil_type['majority'].isna() | (soil_type['majority'] == 0)
is_all_nan_or_zero = is_missing_or_zero.all()

if is_all_nan_or_zero:
    sys.exit("All values in 'majority' are either NaN or zero.")

# check if there is NaN in values
has_nan = soil_type['majority'].isna().any()

if has_nan:
    print("The 'majority' column has NaN values will be replace with majority.")

# Find the majority value excluding NaN and zeros
# (zeros are turned into NaN first; mode() skips NaN and returns its result
# sorted, so .iloc[0] is the smallest value in case of a tie)
majority_value = soil_type['majority'].replace(0, np.nan).mode().iloc[0]

# Replace 0 and NaN values with the majority value. The result is assigned
# back to the column: calling fillna(..., inplace=True) on a selected column
# is a chained assignment that does not update the DataFrame when pandas
# Copy-on-Write is active.
soil_type['majority'] = (
    soil_type['majority']
    .replace(0, majority_value)
    .fillna(majority_value)
)

# unify soil: every subbasin gets the domain majority
if unify_soil:
    soil_type['majority'] = majority_value

# save the modified file next to the original, prefixed with 'modified_'
# get the file name and its path separated:
path_soil_type_path_name = os.path.dirname(path_soil_type)
path_soil_type_file_name = os.path.basename(path_soil_type)
# os.path.join keeps the output in the current directory when the input path
# has no directory part (plain concatenation would produce '/modified_...').
soil_type.to_csv(
    os.path.join(path_soil_type_path_name, 'modified_' + path_soil_type_file_name),
    index=False,
)
#soil_type.to_csv(path_soil_type_path_name+'/'+path_soil_type_file_name, index=False)

# land cover map

### Land cover fractions below a given minimum fraction are set to zero and the remaining fractions are rescaled

In [18]:
# Land cover sanity check and renormalization.
#
# Steps:
#   1. set every fraction below `minimume_land_fraction` to 0
#   2. rescale the remaining fractions of each row so that they sum to 1
#   3. add the frac_<i> columns (i = 1..num_land_cover) that are absent
#   4. order the frac_<i> columns numerically after all other columns
#   5. give the first row a tiny non-zero value in every class (for CLASS)
#   6. write the result next to the input file, prefixed with 'modified_'

# Smallest fraction allowed in the first row (step 5).
first_row_min_fraction = 0.00001

# Steps 1 and 2 act on every column whose name starts with 'frac_'.
threshold_columns = [col for col in landcover_type.columns if col.startswith('frac_')]

if threshold_columns:
    fractions = landcover_type[threshold_columns]

    # Step 1: zero out small fractions. NaN entries are left untouched
    # because the comparison `NaN < threshold` is False.
    fractions = fractions.mask(fractions < minimume_land_fraction, 0)

    # Step 2: divide each row by its sum. Rows whose sum is not positive are
    # divided by 1, i.e. left unchanged. Done on whole columns rather than
    # cell by cell, which is faster and avoids writing float values into
    # integer-typed columns one element at a time.
    row_sum = fractions.sum(axis=1)
    fractions = fractions.div(row_sum.where(row_sum > 0, 1.0), axis=0)

    landcover_type[threshold_columns] = fractions

# Step 3: add non existing columns. They are created as float (0.0) so that
# the small non-zero value written in step 5 matches the column dtype.
missing_columns = [f"frac_{i}" for i in range(1, num_land_cover+1) if f"frac_{i}" not in landcover_type.columns]
for col in missing_columns:
    landcover_type[col] = 0.0

# Step 4: sort columns named "frac_<number>" based on the number at the end
frac_columns = [col for col in landcover_type.columns if re.match(r'^frac_\d+$', col)]
frac_columns.sort(key=lambda x: int(re.search(r'\d+$', x).group()))

# Reorder DataFrame columns: all other columns first, then sorted "frac_" columns
sorted_columns = [col for col in landcover_type.columns if col not in frac_columns] + frac_columns
landcover_type = landcover_type.reindex(columns=sorted_columns)

# Step 5: replace the first line zeros with minimum values for CLASS to run.
# Row label 0 is the first row because the index is reset after sorting by
# COMID when the table is loaded. This is applied after normalization, so the
# first row may sum to slightly more than 1.
for col in frac_columns:
    if landcover_type.loc[0, col] < first_row_min_fraction:
        landcover_type.loc[0, col] = first_row_min_fraction

# Step 6: save the modified file
# get the file name and its path separated:
path_landcover_type_path_name = os.path.dirname(path_landcover_type)
path_landcover_type_file_name = os.path.basename(path_landcover_type)
# os.path.join keeps the output in the current directory when the input path
# has no directory part (plain concatenation would produce '/modified_...').
landcover_type.to_csv(
    os.path.join(path_landcover_type_path_name, 'modified_' + path_landcover_type_file_name),
    index=False,
)
#landcover_type.to_csv(path_landcover_type_path_name+'/'+path_landcover_type_file_name, index=False)


# Mean elevation is set to zero if NaN (won't affect mizuRoute routing).

### Assumes the shapefile is in open water or at sea level.

In [19]:
# Elevation sanity check: a missing mean elevation is replaced by 0.
has_nan = elevation_mean['mean'].isna().any()

if has_nan:
    print("The 'mean' column has NaN values will be replace by 0.")

# Assign the filled column back to the DataFrame. Calling
# fillna(..., inplace=True) on a selected column is a chained assignment that
# does not update the DataFrame when pandas Copy-on-Write is active.
elevation_mean['mean'] = elevation_mean['mean'].fillna(0)


# save the modified file next to the original, prefixed with 'modified_'
# get the file name and its path separated:
path_elevation_mean_path_name = os.path.dirname(path_elevation_mean)
path_elevation_mean_file_name = os.path.basename(path_elevation_mean)
# os.path.join keeps the output in the current directory when the input path
# has no directory part (plain concatenation would produce '/modified_...').
elevation_mean.to_csv(
    os.path.join(path_elevation_mean_path_name, 'modified_' + path_elevation_mean_file_name),
    index=False,
)
#elevation_mean.to_csv(path_elevation_mean_path_name+'/'+path_elevation_mean_file_name, index=False)