### Metadata Management for Data Quality
**Description**: Store and use metadata to manage data quality in a pipeline.

**Steps**:
1. Load metadata
2. Load data
3. Use metadata to validate data quality
4. Show valid data


In [2]:
# write your code from here
import pandas as pd

def load_metadata(metadata_path):
    """
    Load metadata describing schema and quality constraints.

    The format is chosen from the path suffix: a path ending in ``.json``
    (case-insensitive) is read with ``pandas.read_json``; anything else is
    read as CSV.

    Expected CSV layout, e.g.:
    column_name, data_type, nullable, min_value, max_value
    age, int, False, 0, 120
    email, str, True, ,
    balance, float, False, 0.0,

    Spaces that follow a delimiter are skipped, so the layout above yields
    the column names ``data_type``, ``nullable``, ... (not ``' data_type'``)
    and ``nullable`` is parsed as booleans rather than padded strings.

    Args:
        metadata_path: Path of the metadata file.

    Returns:
        The metadata as a DataFrame, or None when the file could not be
        loaded (the failure reason is printed).
    """
    try:
        if str(metadata_path).lower().endswith(".json"):
            return pd.read_json(metadata_path)
        # skipinitialspace makes the "a, b, c" layout shown above parse cleanly.
        return pd.read_csv(metadata_path, skipinitialspace=True)
    except Exception as e:
        # Best-effort loader: callers test the result against None.
        print(f"Failed to load metadata: {e}")
        return None

def load_data(data_path):
    """
    Read the CSV file at ``data_path`` into a DataFrame.

    Returns the DataFrame on success. On any failure the error is printed
    and None is returned instead of raising.
    """
    try:
        frame = pd.read_csv(data_path)
    except Exception as exc:
        print(f"Failed to load data: {exc}")
        return None
    else:
        return frame

# Per-value type predicates keyed by the metadata ``data_type`` name.
# A float holding a whole number counts as an int because pandas stores an
# integer column that contains nulls as float64.
_TYPE_CHECKS = {
    'int': lambda x: isinstance(x, int) or (isinstance(x, float) and x.is_integer()),
    'float': lambda x: isinstance(x, (int, float)),
    'str': lambda x: isinstance(x, str),
}


def _is_nullable(flag):
    """Interpret a metadata ``nullable`` cell as a boolean."""
    if isinstance(flag, str):
        return flag.strip().lower() == 'true'
    # Missing cells (NaN/None) mean "not nullable"; True/1 mean nullable.
    return bool(pd.notna(flag) and flag == 1)


def validate_data_quality(data, metadata):
    """
    Return the rows of ``data`` that satisfy every rule in ``metadata``.

    Each metadata row must provide ``column_name``, ``data_type`` and
    ``nullable``; ``min_value`` / ``max_value`` are optional. For every rule:

    * a column missing from ``data`` prints a warning and invalidates all rows;
    * a non-nullable column invalidates the rows where it is null;
    * ``data_type`` of ``int``, ``float`` or ``str`` invalidates non-null
      values of another type (any other type name is not checked);
    * ``min_value`` / ``max_value`` invalidate non-null values outside the
      bound; values that cannot be read as numbers fail the bound.

    Null values are judged only by the nullability rule, so a nullable column
    may combine nulls with range constraints.

    Args:
        data: DataFrame to validate; any index is supported.
        metadata: DataFrame of rules as described above.

    Returns:
        The valid rows of ``data`` with their original index labels. A summary
        line with the valid/invalid row counts is printed.
    """
    # Build the mask on data's own index so every later mask lines up with it.
    valid_rows_mask = pd.Series(True, index=data.index, dtype=bool)

    for _, rule in metadata.iterrows():
        col = rule['column_name']
        expected_type = str(rule['data_type']).strip().lower()
        nullable = _is_nullable(rule['nullable'])

        if col not in data.columns:
            print(f"Warning: Column '{col}' not found in data.")
            valid_rows_mask &= False
            continue

        column = data[col]
        is_null = column.isna()

        # Check nullability
        if not nullable:
            valid_rows_mask &= ~is_null
            if is_null.any():
                print(f"Column '{col}' has nulls but is not nullable.")

        # Check data type (unknown type names are skipped)
        checker = _TYPE_CHECKS.get(expected_type)
        if checker is not None:
            type_ok = column.apply(checker).astype(bool)
            valid_rows_mask &= is_null | type_ok

        # Optional min/max checks; nulls are exempt (handled above)
        min_value = rule.get('min_value')
        max_value = rule.get('max_value')
        if pd.notna(min_value) or pd.notna(max_value):
            # Coerce so that text columns yield NaN instead of raising.
            numeric = pd.to_numeric(column, errors='coerce')
            if pd.notna(min_value):
                valid_rows_mask &= is_null | (numeric >= float(min_value))
            if pd.notna(max_value):
                valid_rows_mask &= is_null | (numeric <= float(max_value))

    # Positional boolean selection avoids any index re-alignment.
    valid_data = data[valid_rows_mask.to_numpy()]
    invalid_count = len(data) - len(valid_data)
    print(f"Valid rows: {len(valid_data)}; Invalid rows removed: {invalid_count}")
    return valid_data

if __name__ == "__main__":
    # Input files are expected in the current working directory.
    metadata_path, data_path = "metadata.csv", "data.csv"

    # Each loader prints its own error and returns None on failure, so the
    # pipeline simply stops at the first stage that yields nothing.
    metadata = load_metadata(metadata_path)
    if metadata is not None:
        data = load_data(data_path)
        if data is not None:
            valid_data = validate_data_quality(data, metadata)
            print("\nValid data preview:", valid_data.head(), sep="\n")


Column 'name' has nulls but is not nullable.
Column 'age' has nulls but is not nullable.
Valid rows: 2; Invalid rows removed: 4

Valid data preview:
   id   name   age              email  balance
0   1  Alice  25.0  alice@example.com   1000.5
4   5    Eve  29.0                NaN   1200.0
