In [6]:
# Title: Define Data Quality KPIs

# Step 1: Import necessary libraries
import pandas as pd
import numpy as np

# Step 2: Small illustrative dataset with deliberate gaps (None) and a
# duplicated e-mail address; swap in a real dataset as needed.
data = dict(
    CustomerID=[1, 2, 3, 4, None],
    Name=['Alice', 'Bob', 'Charlie', None, 'Eva'],
    Email=[
        'alice@example.com',
        'bob@example.com',
        'charlie@example.com',
        'bob@example.com',
        None,
    ],
    Age=[25, None, 35, 29, 42],
    Country=['USA', 'USA', None, 'Canada', 'USA'],
)
df = pd.DataFrame(data)

# Step 3: Define KPI Functions

# 1. Completeness: % of non-null values
def completeness_score(df):
    """Return the percentage of non-null cells in ``df``.

    Args:
        df: DataFrame to evaluate.

    Returns:
        The share of non-null cells as a percentage rounded to two
        decimals, or ``None`` when ``df`` contains no cells at all (no rows
        or no columns), because the ratio is undefined in that case.
    """
    total_values = df.size
    if total_values == 0:
        # An empty frame would otherwise trigger a division by zero.
        return None
    non_null_values = df.notnull().sum().sum()
    return round(float(non_null_values) / total_values * 100, 2)

# 2. Uniqueness: % of unique IDs or key fields
def uniqueness_score(df, column):
    """Return the percentage of distinct values among the non-null entries of a column.

    Args:
        df: DataFrame to evaluate.
        column: Name of the column (typically an ID or key field) to check.

    Returns:
        ``unique / total * 100`` rounded to two decimals, where both counts
        ignore nulls. ``None`` when ``column`` is not in ``df`` or when the
        column has no non-null values (the ratio is undefined).
    """
    if column not in df.columns:
        return None
    # Drop nulls once and reuse the result for both counts.
    values = df[column].dropna()
    total = values.shape[0]
    if total == 0:
        # All values are null (or the frame has no rows): avoid dividing by zero.
        return None
    return round(values.nunique() / total * 100, 2)

# 3. Accuracy: % values within valid range (example: age between 0 and 120)
def accuracy_score(df, column='Age', min_value=0, max_value=120):
    """Return the percentage of non-null values of a column that fall within a range.

    Args:
        df: DataFrame to evaluate.
        column: Name of the column to check. Defaults to ``'Age'``.
        min_value: Inclusive lower bound of the valid range. Defaults to 0.
        max_value: Inclusive upper bound of the valid range. Defaults to 120.

    Returns:
        The share of non-null values within ``[min_value, max_value]`` as a
        percentage rounded to two decimals. ``None`` when ``column`` is not
        in ``df`` or when it has no non-null values (the ratio is undefined).
    """
    if column not in df.columns:
        return None
    values = df[column].dropna()
    if values.empty:
        # Nothing to evaluate: avoid dividing by zero.
        return None
    # Series.between is inclusive on both ends by default.
    in_range = values.between(min_value, max_value)
    return round(float(in_range.sum()) / len(values) * 100, 2)

# 4. Consistency: Example — email column has valid format
def consistency_score(df):
    """Return the percentage of non-null ``Email`` values that look like an e-mail address.

    A value counts as valid when it matches a simple ``local@domain.tld``
    regular expression; this is a format check only, not a deliverability
    check.

    Args:
        df: DataFrame to evaluate.

    Returns:
        The share of non-null ``Email`` values matching the pattern as a
        percentage rounded to two decimals. ``None`` when ``df`` has no
        ``Email`` column or when the column has no non-null values (the
        ratio is undefined).
    """
    if 'Email' not in df.columns:
        return None
    emails = df['Email'].dropna()
    if emails.empty:
        # Nothing to evaluate: avoid dividing by zero.
        return None
    pattern = r'^[\w\.-]+@[\w\.-]+\.\w+$'
    valid_emails = emails.str.match(pattern)
    return round(float(valid_emails.sum()) / len(emails) * 100, 2)

# Step 4: Evaluate each KPI against the sample frame
completeness, uniqueness, accuracy, consistency = (
    completeness_score(df),
    uniqueness_score(df, 'CustomerID'),
    accuracy_score(df),
    consistency_score(df),
)

# Step 5: Collect the scores under their report labels and print them as a table
kpi_results = dict(zip(
    (
        'Completeness (%)',
        'Uniqueness (CustomerID) (%)',
        'Accuracy (Age) (%)',
        'Consistency (Email format) (%)',
    ),
    (completeness, uniqueness, accuracy, consistency),
))

kpi_df = pd.DataFrame(
    list(kpi_results.items()),
    columns=['Data Quality KPI', 'Score'],
)
print(kpi_df)

                 Data Quality KPI  Score
0                Completeness (%)   80.0
1     Uniqueness (CustomerID) (%)  100.0
2              Accuracy (Age) (%)  100.0
3  Consistency (Email format) (%)  100.0
