## Data Quality Framework Implementation

**Description**: Implement a simple data quality measurement framework, based on ISO 8000 principles, that scores a dataset on six key dimensions: completeness, accuracy, consistency, validity, timeliness, and uniqueness.

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import re

# ------------------------------
# Custom Sample Dataset
# ------------------------------
# Column-wise sample records. The set contains a repeated customer_id (3),
# one missing name, one e-mail without a dot ("bob@example") and one phone
# number written with dashes only.
sample_columns = {
    "customer_id": [1, 2, 3, 3, 5],
    "name": ["Alice", "Bob", "Charlie", None, "Eve"],
    "email": [
        "alice@example.com",
        "bob@example",
        "charlie@example.com",
        "charlie@example.com",
        "eve@example.com",
    ],
    "last_updated": [
        "2023-05-01",
        "2021-01-01",
        "2024-03-01",
        "2022-12-15",
        "2023-11-20",
    ],
    "phone": [
        "(123) 456-7890",
        "(234) 567-8901",
        "123-456-7890",
        "(345) 678-9012",
        "(456) 789-0123",
    ],
}
data = pd.DataFrame(sample_columns)

# Parse the date strings into datetimes; errors="coerce" turns values that
# cannot be parsed into NaT instead of raising.
data["last_updated"] = pd.to_datetime(data["last_updated"], errors="coerce")

# ------------------------------
# Initialize Scores
# ------------------------------
# Maps each quality dimension name to its percentage score.
scores = {}

# ------------------------------
# 1. Completeness
# ------------------------------
# Fraction of non-null cells per column, averaged over all columns.
non_null_share_per_column = data.notnull().mean()
completeness = non_null_share_per_column.mean() * 100
scores["Completeness"] = round(completeness, 2)

# ------------------------------
# 2. Accuracy (Simple: valid email has exactly one '@' and a dotted domain)
# ------------------------------
def is_email_valid(email):
    """Return True when *email* has the basic ``local@domain.tld`` shape.

    This is a deliberately simple structural check, not full RFC validation.
    A value is accepted only when all of the following hold:

    * it is a ``str`` (``None`` and other types are rejected),
    * it contains exactly one ``@`` with a non-empty part before it,
    * the part after ``@`` contains at least one ``.`` and has no empty
      labels (so no leading, trailing or doubled dots).

    Args:
        email: The value to check; may be any object.

    Returns:
        bool: True if the value passes the checks above, False otherwise.
    """
    if not isinstance(email, str):
        return False

    local_part, at_sign, domain = email.partition("@")
    # Requiring the dot to be in the domain (not just anywhere) rejects
    # values such as "first.last@example"; a second '@' is rejected too.
    if not local_part or not at_sign or "@" in domain:
        return False

    labels = domain.split(".")
    return len(labels) >= 2 and all(labels)

# Flag each e-mail with is_email_valid, then express the share of rows that
# pass as a percentage.
email_passes = data["email"].map(is_email_valid)
accuracy = email_passes.mean() * 100
scores["Accuracy"] = round(accuracy, 2)

# ------------------------------
# 3. Consistency (email must be same for same customer_id)
# ------------------------------
# Count the distinct e-mail values recorded for each customer_id.
grouped = data.groupby("customer_id")["email"].nunique()

# A customer_id with more than one distinct e-mail is a consistency issue.
has_conflict = grouped > 1
consistency_issues = sum(has_conflict) if any(has_conflict) else 0

# Percentage of customer_id groups without conflicting e-mails.
group_count = len(grouped)
consistency = ((group_count - consistency_issues) / group_count) * 100
scores["Consistency"] = round(consistency, 2)

# ------------------------------
# 4. Validity (phone must match (XXX) XXX-XXXX)
# ------------------------------
# Expected layout: "(XXX) XXX-XXXX" where X is a digit.
phone_pattern = re.compile(r"^\(\d{3}\) \d{3}-\d{4}$")

# fullmatch requires the whole string to match. With match(), the "$" anchor
# also succeeds just before a trailing newline, so "(123) 456-7890\n" would
# have been counted as valid.
# Values are passed through str() first, so non-string entries (e.g. None)
# are tested as their text form and fail the pattern.
validity = data["phone"].apply(lambda x: bool(phone_pattern.fullmatch(str(x)))).mean() * 100
scores["Validity"] = round(validity, 2)

# ------------------------------
# 5. Timeliness (must be updated within 1 year from today)
# ------------------------------
# The cutoff is computed from the current time, so this score depends on the
# day the script is run.
one_year = timedelta(days=365)
cutoff_date = datetime.now() - one_year

# Percentage of rows whose last_updated is later than the cutoff.
updated_recently = data["last_updated"] > cutoff_date
timeliness = updated_recently.mean() * 100
scores["Timeliness"] = round(timeliness, 2)

# ------------------------------
# 6. Uniqueness (check duplicate customer_id)
# ------------------------------
# Rows whose customer_id repeats an earlier row are flagged as duplicates
# (the first occurrence is not flagged).
repeated_id = data.duplicated(subset=["customer_id"])
duplicate_share = repeated_id.mean()

# Percentage of rows that are not flagged as duplicates.
uniqueness = (1 - duplicate_share) * 100
scores["Uniqueness"] = round(uniqueness, 2)

# ------------------------------
# Overall Data Quality Score
# ------------------------------
# Unweighted mean of every score collected so far, rounded to two decimals.
dimension_scores = list(scores.values())
overall = round(np.mean(dimension_scores), 2)
scores["Overall"] = overall

# ------------------------------
# Output Results
# ------------------------------
# One "<dimension>: <score>%" line per entry, in insertion order, so
# "Overall" is printed last.
for dimension, score in scores.items():
    report_line = f"{dimension}: {score}%"
    print(report_line)

Completeness: 96.0%
Accuracy: 80.0%
Consistency: 100.0%
Validity: 80.0%
Timeliness: 0.0%
Uniqueness: 80.0%
Overall: 72.67%
