### Metadata Management for Data Quality
**Description**: Store and use metadata to manage data quality in a pipeline.

**Steps**:
1. Load metadata
2. Load data
3. Use metadata to validate data quality
4. Show valid data


In [3]:
import pandas as pd

def load_metadata(metadata_path):
    try:
        metadata = pd.read_csv(metadata_path)
        # Basic sanity check
        required_cols = {'column_name', 'data_type', 'nullable'}
        if not required_cols.issubset(set(metadata.columns)):
            raise ValueError(f"Metadata missing required columns: {required_cols}")
        return metadata
    except Exception as e:
        print(f"[Error] Failed to load metadata: {e}")
        return None

def load_data(data_path):
    try:
        data = pd.read_csv(data_path)
        return data
    except Exception as e:
        print(f"[Error] Failed to load data: {e}")
        return None

def safe_cast(value, to_type):
    """Try to cast value to the given type safely."""
    try:
        if to_type == 'int':
            # Convert float strings to int if possible
            if isinstance(value, float) and value.is_integer():
                return int(value)
            return int(value)
        elif to_type == 'float':
            return float(value)
        elif to_type == 'str':
            return str(value)
        else:
            return value  # unknown types just return original
    except (ValueError, TypeError):
        return None

def validate_data_quality(data, metadata):
    valid_rows_mask = pd.Series([True] * len(data))

    for _, row in metadata.iterrows():
        col = row['column_name']
        expected_type = row['data_type'].lower()
        nullable = str(row['nullable']).lower() == 'true'

        if col not in data.columns:
            print(f"[Warning] Column '{col}' missing in data.")
            valid_rows_mask &= False
            continue

        col_values = data[col]

        # Nullability check
        if not nullable:
            non_null_mask = col_values.notnull()
            valid_rows_mask &= non_null_mask
            if not non_null_mask.all():
                print(f"[Error] Column '{col}' has null values but is not nullable.")

        # Type check with safe casting
        casted_values = col_values.apply(lambda x: safe_cast(x, expected_type))
        type_valid_mask = casted_values.notnull()
        valid_rows_mask &= type_valid_mask
        if not type_valid_mask.all():
            invalid_count = (~type_valid_mask).sum()
            print(f"[Error] Column '{col}' has {invalid_count} values not castable to {expected_type}.")

        # Range checks if applicable
        for bound in ['min_value', 'max_value']:
            if bound in row and pd.notna(row[bound]):
                try:
                    bound_val = float(row[bound])
                    if bound == 'min_value':
                        range_mask = casted_values >= bound_val
                    else:
                        range_mask = casted_values <= bound_val
                    valid_rows_mask &= range_mask.fillna(False)
                    if not range_mask.all():
                        print(f"[Error] Column '{col}' has values outside {bound} {bound_val}.")
                except Exception as e:
                    print(f"[Warning] Could not apply {bound} check for column '{col}': {e}")

    valid_data = data[valid_rows_mask]
    invalid_count = len(data) - len(valid_data)
    print(f"Validation complete: {len(valid_data)} valid rows, {invalid_count} invalid rows removed.")
    return valid_data


# Minimal test function (expand with more cases as needed)
def test_validate_data_quality():
    from io import StringIO

    metadata_csv = """column_name,data_type,nullable,min_value,max_value
id,int,False,,
age,int,False,0,120
name,str,False,,
balance,float,False,0,
"""

    data_csv = """id,age,name,balance
1,25,Alice,1000.5
2,130,Bob,2000.0
3,,Charlie,1500.0
4,35,,500.0
5,29,Eve,1200.0
6,28,Frank,-50.0
7,abc,Gary,300.0
"""

    metadata = pd.read_csv(StringIO(metadata_csv))
    data = pd.read_csv(StringIO(data_csv))

    valid_data = validate_data_quality(data, metadata)
    print("\nValid data rows:")
    print(valid_data)

if __name__ == "__main__":
    # Example usage:
    # metadata = load_metadata("metadata.csv")
    # data = load_data("data.csv")
    # if metadata is not None and data is not None:
    #     valid_data = validate_data_quality(data, metadata)
    test_validate_data_quality()


[Error] Column 'age' has null values but is not nullable.
[Error] Column 'age' has 2 values not castable to int.
[Error] Column 'age' has values outside min_value 0.0.
[Error] Column 'age' has values outside max_value 120.0.
[Error] Column 'name' has null values but is not nullable.
[Error] Column 'balance' has values outside min_value 0.0.
Validation complete: 2 valid rows, 5 invalid rows removed.

Valid data rows:
   id age   name  balance
0   1  25  Alice   1000.5
4   5  29    Eve   1200.0
