In [2]:
import pandas as pd
import numpy as np

# Small illustrative dataset (replace this with your actual dataset).
# It contains one missing Name, one missing Age, one missing Salary and a
# repeated email address, so each quality metric has something to detect.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', None],
    Age=[25, 30, None, 22, 28],
    Email=[
        'alice@example.com',
        'bob@example.com',
        'charlie@example.com',
        'david@example.com',
        'bob@example.com',
    ],
    Salary=[50000, 60000, 55000, None, 52000],
)
df = pd.DataFrame(data=data)

# --- 1. Completeness Score ---
def compute_completeness(df):
    """Return the percentage of non-null cells in ``df``.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        A float in [0, 100], rounded to two decimals. A DataFrame with no
        cells (no rows and/or no columns) scores 100.0, since there is
        nothing that could be missing.
    """
    total_cells = df.size
    if total_cells == 0:
        # Guard against 0/0, which would otherwise yield NaN and poison any
        # aggregate score computed from this value.
        return 100.0
    # int() normalises the NumPy scalar so the result is a plain float.
    missing_cells = int(df.isnull().sum().sum())
    completeness_score = ((total_cells - missing_cells) / total_cells) * 100
    return round(completeness_score, 2)

# --- 2. Accuracy Score ---
def compute_accuracy(df, accuracy_checks=None):
    """Return the percentage of non-null values that pass their column check.

    Args:
        df: The pandas DataFrame to inspect.
        accuracy_checks: Optional mapping of column name to a callable. Each
            callable receives the column's non-null values as a Series and
            must return a boolean Series of the same length (True = valid).
            When omitted, the built-in rules are used: ``Age`` within
            0-120 and ``Salary`` within 30000-200000 (both inclusive).

    Returns:
        A float in [0, 100], rounded to two decimals. Columns named in the
        checks but absent from ``df`` are skipped. If no value was checked
        at all, the score is 100.0.
    """
    if accuracy_checks is None:
        # Default expected ranges for the known columns.
        accuracy_checks = {
            'Age': lambda x: x.between(0, 120),
            'Salary': lambda x: x.between(30000, 200000),
        }

    total_checks = 0
    valid_checks = 0
    for column, check in accuracy_checks.items():
        if column not in df.columns:
            continue
        # Missing values are a completeness concern, not an accuracy one,
        # so they are excluded before the rule is applied.
        valid = check(df[column].dropna())
        total_checks += int(valid.size)
        valid_checks += int(valid.sum())

    if total_checks == 0:
        return 100.0
    accuracy_score = (valid_checks / total_checks) * 100
    return round(accuracy_score, 2)

# --- 3. Consistency Score ---
def compute_consistency(df):
    """Return the percentage of non-null emails that are distinct.

    The ``Email`` column is expected to hold unique values, so duplicates
    lower the score.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        A float in [0, 100], rounded to two decimals. Returns 100.0 when the
        ``Email`` column is absent or contains no non-null values, because
        there is then nothing that could be inconsistent.
    """
    if 'Email' not in df.columns:
        return 100.0

    emails = df['Email'].dropna()
    total_emails = emails.size
    if total_emails == 0:
        # Avoid ZeroDivisionError when every email is missing (or df is empty).
        return 100.0

    consistency_score = (emails.nunique() / total_emails) * 100
    return round(consistency_score, 2)

# --- Aggregate Data Quality Score ---
def compute_data_quality_score(df):
    """Compute each quality metric for ``df`` plus their unweighted mean.

    Args:
        df: The pandas DataFrame to score.

    Returns:
        A dict with the keys 'Completeness', 'Accuracy', 'Consistency' and
        'Overall Data Quality Score' (in that order). The overall score is
        the arithmetic mean of the three metrics, rounded to two decimals.
    """
    report = {
        'Completeness': compute_completeness(df),
        'Accuracy': compute_accuracy(df),
        'Consistency': compute_consistency(df),
    }
    # All three metrics carry equal weight in the aggregate.
    mean_of_metrics = sum(report.values()) / 3
    report['Overall Data Quality Score'] = round(mean_of_metrics, 2)
    return report

# Score the sample DataFrame and print one "<metric>: <value>%" line per entry.
scores = compute_data_quality_score(df)
for metric in scores:
    score = scores[metric]
    print(f"{metric}: {score}%")

Completeness: 85.0%
Accuracy: 100.0%
Consistency: 80.0%
Overall Data Quality Score: 88.33%
