## Build a Data Quality Dashboard

**Description**: Create a simple dashboard that displays data quality metrics using a library like `dash` or `streamlit`.

**Steps:**
1. Install Streamlit: `pip install streamlit`
2. Create a Python script named `dashboard.py`.
3. Run the dashboard: `streamlit run dashboard.py`

In [18]:
import pandas as pd
import re

# --------- Validation Helper ---------
def validate_dataframe(df):
    """Report whether ``df`` contains at least one non-null value.

    Args:
        df: Object expected to be a pandas DataFrame.

    Returns:
        ``False`` when the frame is empty or every cell is null,
        ``True`` otherwise.

    Raises:
        ValueError: If ``df`` is not a pandas DataFrame.
    """
    if isinstance(df, pd.DataFrame):
        if df.empty:
            return False
        # Reduce column-wise, then across columns: True only if no cell has data.
        every_cell_null = df.isnull().all().all()
        return not every_cell_null
    raise ValueError("Input must be a pandas DataFrame.")

# --------- Completeness Metric ---------
def calculate_completeness(df):
    """Return the fraction of non-null cells in ``df``, rounded to 2 decimals.

    Args:
        df: pandas DataFrame to measure.

    Returns:
        ``0.0`` when ``validate_dataframe`` rejects the frame (empty or
        entirely null); otherwise non-null cells divided by total cells.

    Raises:
        ValueError: Propagated from ``validate_dataframe`` when ``df`` is
            not a pandas DataFrame.
    """
    if validate_dataframe(df):
        # count() tallies non-null entries per column; sum() totals them.
        filled_cells = df.count().sum()
        return round(filled_cells / df.size, 2)
    return 0.0

# --------- Accuracy Metric ---------
def calculate_accuracy(df):
    """Return the fraction of non-null ``email`` values that look like emails.

    Each non-null value is converted with ``str()`` and must match the
    pattern ``name@domain.tld`` in its entirety.

    Args:
        df: pandas DataFrame, expected to carry an ``email`` column.

    Returns:
        ``0.0`` when ``validate_dataframe`` rejects the frame, when there is
        no ``email`` column, or when every email is null; otherwise the
        share of well-formed emails rounded to 2 decimals.

    Raises:
        ValueError: Propagated from ``validate_dataframe`` when ``df`` is
            not a pandas DataFrame.
    """
    if not validate_dataframe(df):
        return 0.0
    if 'email' not in df.columns:
        return 0.0

    emails = df['email'].dropna()
    if emails.empty:
        return 0.0

    # fullmatch() instead of match() with a trailing '$': '$' also matches
    # just before a final newline, so "user@example.com\n" used to be
    # counted as a valid address.
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
    is_valid = emails.apply(lambda value: email_re.fullmatch(str(value)) is not None)
    return round(is_valid.sum() / len(is_valid), 2)

# --------- Consistency Metric ---------
def calculate_consistency(df):
    """Return the fraction of non-null ``grade`` values that are A through F.

    Args:
        df: pandas DataFrame, expected to carry a ``grade`` column.

    Returns:
        ``0.0`` when ``validate_dataframe`` rejects the frame, when there is
        no ``grade`` column, or when every grade is null; otherwise the
        share of grades found in the allowed set, rounded to 2 decimals.

    Raises:
        ValueError: Propagated from ``validate_dataframe`` when ``df`` is
            not a pandas DataFrame.
    """
    # Validation runs first so a non-DataFrame raises before column access.
    if not validate_dataframe(df) or 'grade' not in df.columns:
        return 0.0

    allowed = {'A', 'B', 'C', 'D', 'E', 'F'}
    grades = df['grade'].dropna()
    if len(grades) == 0:
        return 0.0

    # Membership is exact: lowercase or padded grades count as inconsistent.
    in_allowed = grades.apply(lambda grade: grade in allowed)
    return round(in_allowed.sum() / len(in_allowed), 2)

# --------- Combined DQI Score ---------
def calculate_dqi(df):
    """Compute the three quality metrics for ``df`` and their mean.

    Args:
        df: pandas DataFrame to score.

    Returns:
        Dict with the keys ``"completeness"``, ``"accuracy"``,
        ``"consistency"`` and ``"dqi_score"``; the last is the unweighted
        mean of the first three, rounded to 2 decimals.

    Raises:
        ValueError: Propagated from the metric functions when ``df`` is not
            a pandas DataFrame.
    """
    components = {
        "completeness": calculate_completeness(df),
        "accuracy": calculate_accuracy(df),
        "consistency": calculate_consistency(df),
    }
    mean_score = sum(components.values()) / len(components)
    return {**components, "dqi_score": round(mean_score, 2)}

# --------- Sample Usage ---------
# Reads 'data_quality_sample.csv' via a relative path, so the cell that
# writes that file must have been run before this one.
df = pd.read_csv('data_quality_sample.csv')
# Dict with the keys "completeness", "accuracy", "consistency", "dqi_score".
score = calculate_dqi(df)
print(score)


{'completeness': 0.95, 'accuracy': 0.89, 'consistency': 0.8, 'dqi_score': 0.88}


In [19]:
# dq_metrics.py

import pandas as pd
import re

def validate_dataframe(df):
    """Report whether ``df`` contains at least one non-null value.

    Args:
        df: Object expected to be a pandas DataFrame.

    Returns:
        ``False`` when the frame is empty or every cell is null,
        ``True`` otherwise.

    Raises:
        ValueError: If ``df`` is not a pandas DataFrame.
    """
    if isinstance(df, pd.DataFrame):
        if df.empty:
            return False
        # Reduce column-wise, then across columns: True only if no cell has data.
        every_cell_null = df.isnull().all().all()
        return not every_cell_null
    raise ValueError("Input must be a pandas DataFrame.")

def calculate_completeness(df):
    """Return the fraction of non-null cells in ``df``, rounded to 2 decimals.

    Args:
        df: pandas DataFrame to measure.

    Returns:
        ``0.0`` when ``validate_dataframe`` rejects the frame (empty or
        entirely null); otherwise non-null cells divided by total cells.

    Raises:
        ValueError: Propagated from ``validate_dataframe`` when ``df`` is
            not a pandas DataFrame.
    """
    if validate_dataframe(df):
        # count() tallies non-null entries per column; sum() totals them.
        filled_cells = df.count().sum()
        return round(filled_cells / df.size, 2)
    return 0.0

def calculate_accuracy(df):
    """Return the fraction of non-null ``email`` values that look like emails.

    Each non-null value is converted with ``str()`` and must match the
    pattern ``name@domain.tld`` in its entirety.

    Args:
        df: pandas DataFrame, expected to carry an ``email`` column.

    Returns:
        ``0.0`` when ``validate_dataframe`` rejects the frame, when there is
        no ``email`` column, or when every email is null; otherwise the
        share of well-formed emails rounded to 2 decimals.

    Raises:
        ValueError: Propagated from ``validate_dataframe`` when ``df`` is
            not a pandas DataFrame.
    """
    if not validate_dataframe(df):
        return 0.0
    if 'email' not in df.columns:
        return 0.0

    emails = df['email'].dropna()
    if emails.empty:
        return 0.0

    # fullmatch() instead of match() with a trailing '$': '$' also matches
    # just before a final newline, so "user@example.com\n" used to be
    # counted as a valid address.
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
    is_valid = emails.apply(lambda value: email_re.fullmatch(str(value)) is not None)
    return round(is_valid.sum() / len(is_valid), 2)

def calculate_consistency(df):
    """Return the fraction of non-null ``grade`` values that are A through F.

    Args:
        df: pandas DataFrame, expected to carry a ``grade`` column.

    Returns:
        ``0.0`` when ``validate_dataframe`` rejects the frame, when there is
        no ``grade`` column, or when every grade is null; otherwise the
        share of grades found in the allowed set, rounded to 2 decimals.

    Raises:
        ValueError: Propagated from ``validate_dataframe`` when ``df`` is
            not a pandas DataFrame.
    """
    # Validation runs first so a non-DataFrame raises before column access.
    if not validate_dataframe(df) or 'grade' not in df.columns:
        return 0.0

    allowed = {'A', 'B', 'C', 'D', 'E', 'F'}
    grades = df['grade'].dropna()
    if len(grades) == 0:
        return 0.0

    # Membership is exact: lowercase or padded grades count as inconsistent.
    in_allowed = grades.apply(lambda grade: grade in allowed)
    return round(in_allowed.sum() / len(in_allowed), 2)

def calculate_dqi(df):
    """Compute the three quality metrics for ``df`` and their mean.

    Args:
        df: pandas DataFrame to score.

    Returns:
        Dict with the keys ``"completeness"``, ``"accuracy"``,
        ``"consistency"`` and ``"dqi_score"``; the last is the unweighted
        mean of the first three, rounded to 2 decimals.

    Raises:
        ValueError: Propagated from the metric functions when ``df`` is not
            a pandas DataFrame.
    """
    components = {
        "completeness": calculate_completeness(df),
        "accuracy": calculate_accuracy(df),
        "consistency": calculate_consistency(df),
    }
    mean_score = sum(components.values()) / len(components)
    return {**components, "dqi_score": round(mean_score, 2)}


In [20]:
import pandas as pd

# Ten-row sample with deliberate quality problems: two missing ages, one
# missing email, one malformed email ("bob[at]example.com"), and two grades
# ("G", "H") outside the A-F set that calculate_consistency accepts.
data = {
    "id": list(range(1, 11)),
    "name": [
        "Alice", "Bob", "Charlie", "David", "Eve",
        "Frank", "Grace", "Hank", "Ivy", "Jack",
    ],
    "age": [25, None, 30, 27, None, 22, 29, 35, 28, 31],
    "email": [
        "alice@example.com",
        "bob[at]example.com",
        None,
        "david@example.com",
        "eve@example.com",
        "frank@example.com",
        "grace@example.com",
        "hank@example.com",
        "ivy@example.com",
        "jack@example.com",
    ],
    "gender": ["F", "M", "M", "M", "F", "M", "F", "M", "F", "M"],
    "grade": ["A", "B", "C", "A", "B", "D", "E", "F", "G", "H"],
}

# index=False keeps the row index out of the CSV so it round-trips with
# exactly the six columns above.
df = pd.DataFrame(data)
df.to_csv("data_quality_sample.csv", index=False)
