### Metadata Management for Data Quality
**Description**: Store and use metadata to manage data quality in a pipeline.

**Steps**:
1. Load metadata
2. Load data
3. Use metadata to validate data quality
4. Show valid data


In [2]:
import pandas as pd

# Column-level quality rules: expected logical type and whether a value
# must be present in every row.
metadata = {
    "columns": {
        "id": {"type": "int", "required": True},
        "name": {"type": "str", "required": True},
        "age": {"type": "int", "required": False},
        "email": {"type": "str", "required": False}
    }
}

# Sample data with one missing "id" (row 2) and one missing "name" (row 3).
data = pd.DataFrame({
    "id": [1, 2, None, 4],
    "name": ["Alice", "Bob", "Charlie", None],
    "age": [25, 30, 22, 28],
    "email": ["a@example.com", "b@example.com", "c@example.com", "d@example.com"]
})


def _is_int_like(value):
    """Return True if *value* is a whole number (``3`` or ``3.0``).

    Floats are accepted only when integral, because pandas stores an integer
    column that contains a missing value as float64. ``bool`` is rejected
    even though it subclasses ``int``; NaN and infinities are rejected too.
    """
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return False
    return isinstance(value, int) or value.is_integer()


def _is_str(value):
    """Return True if *value* is a ``str`` instance."""
    return isinstance(value, str)


# Maps the "type" names used in the metadata to per-value checks.
_TYPE_CHECKS = {"int": _is_int_like, "str": _is_str}


def filter_valid_rows(frame, schema):
    """Return the rows of *frame* that satisfy the column rules in *schema*.

    Args:
        frame: DataFrame to validate; it is not modified.
        schema: Mapping with a ``"columns"`` entry that maps column names to
            rules of the form ``{"type": ..., "required": ...}``.

    Returns:
        A new DataFrame holding only the valid rows, with the original index
        labels and columns.

    Rules applied per column:
        * Columns listed in *schema* but absent from *frame* are skipped.
        * A required column must be non-null and of the declared type.
        * An optional column may be null; non-null values must still be of
          the declared type.
        * Type names without a registered check impose no type constraint.
    """
    # Explicit bool dtype so the mask is still treated as a row filter when
    # the frame is empty.
    keep = pd.Series(True, index=frame.index, dtype=bool)
    for col, rules in schema["columns"].items():
        if col not in frame.columns:
            continue
        values = frame[col]
        missing = values.isna()
        check = _TYPE_CHECKS.get(rules["type"])
        if check is None:
            type_ok = pd.Series(True, index=frame.index, dtype=bool)
        else:
            type_ok = pd.Series(
                [check(value) for value in values],
                index=frame.index,
                dtype=bool,
            )
        if rules["required"]:
            column_ok = ~missing & type_ok
        else:
            # Missing values are acceptable in optional columns.
            column_ok = missing | type_ok
        keep = keep & column_ok
    return frame[keep]


valid_data = filter_valid_rows(data, metadata)

print(valid_data)

    id   name  age          email
0  1.0  Alice   25  a@example.com
1  2.0    Bob   30  b@example.com
