Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 41 additions & 11 deletions src/schema_validata.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,29 @@ def get_spreadsheet_metadata(file_path):

# ----------------------------------------------------------------------------------

def is_numeric_type(value):
    """
    Checks if a value is a common numeric data type in
    pandas, NumPy, or Python.

    Parameters:
    ----------
    value: The value to check.

    Returns:
    -------
    bool: True if the value is numeric, False otherwise.

    Notes:
    -----
    Python ``bool`` is a subclass of ``int``, so ``True``/``False`` are
    reported as numeric. NumPy booleans (``np.bool_``) are not part of the
    ``np.number`` hierarchy and are reported as non-numeric.
    """
    # Built-in numeric scalars (int, float, complex) are matched directly;
    # anything else is looked up in the NumPy scalar hierarchy via its type,
    # which covers NumPy integer/float/complex scalars such as np.int64.
    return (
        isinstance(value, (int, float, complex))
        or bool(np.issubdtype(type(value), np.number))
    )

# ----------------------------------------------------------------------------------

def downcast_ints(value):
"""
Downcast a numeric value to an integer if it is equal to
Expand Down Expand Up @@ -1214,7 +1237,7 @@ def series_hasNull(series,

#----------------------------------------------------------------------------------

def get_numeric_range(series,
def get_numeric_range(pd_series,
attribute,
na_val=None
):
Expand All @@ -1223,7 +1246,7 @@ def get_numeric_range(series,
numerical and non-numerical cases.

Parameters:
series (pd.Series):
pd_series (pd.Series):
The Pandas Series to process.
attribute (str):
The desired statistical attribute, either 'min' or 'max'.
Expand All @@ -1238,12 +1261,17 @@ def get_numeric_range(series,
value as an integer if possible; otherwise, returns it as a float. If the
Series is empty or non-numeric, returns (na_val).
"""
_s = series.dropna()
# Check for integers or float
_s = pd_series.replace(r'^\s+$', pd.NA, regex=True)
_s.fillna(pd.NA)
try:
_s = pd.to_numeric(_s)
_s.fillna(pd.NA)
except:
pass


_s = _s.dropna()

if not pd.api.types.is_numeric_dtype(_s):
return na_val # Return `na_val` for non-numeric Series

Expand Down Expand Up @@ -1817,9 +1845,9 @@ def schema_validate_range(attribute,
"""

# Check if the expected range is a numeric value
if isinstance(p_errors[attribute]['expected'], (int, float)):
if is_numeric_type(p_errors[attribute]['expected']):
# Check if the observed value is also a numeric value
if isinstance(p_errors[attribute]['observed'], (int, float)):
if is_numeric_type(p_errors[attribute]['observed']):
exp_val = p_errors[attribute]['expected']
obs_val = p_errors[attribute]['observed']

Expand Down Expand Up @@ -2290,9 +2318,11 @@ def value_errors_out_of_range(df,
results = []

# Check for integers or float
numeric_column = df[column_name].notna()
numeric_column = df[column_name].replace(r'^\s+$', pd.NA, regex=True)
numeric_column.fillna(pd.NA)
try:
numeric_column = pd.to_numeric(numeric_column)
numeric_column.fillna(pd.NA)
except:
pass

Expand Down Expand Up @@ -2463,20 +2493,20 @@ def get_value_errors(dataset_path,
)
if 'range_max' in flagged_errs \
and 'range_max' not in ignore_errors:
max_len = errors['range_max']['expected']
rng_max = errors['range_max']['expected']
sheet_v_errors.append(
value_errors_out_of_range(df, col,
test_type='max',
value=max_len,
value=rng_max,
unique_column=unique_column)
)
if 'range_min' in flagged_errs \
and 'range_min' not in ignore_errors:
min_len = errors['range_min']['expected']
rng_min = errors['range_min']['expected']
sheet_v_errors.append(
value_errors_out_of_range(df, col,
test_type='min',
value=min_len,
value=rng_min,
unique_column=unique_column)
)
if 'allowed_value_list' in flagged_errs \
Expand Down